From 51f336a0f87ddcf5b145dee49b15322c52699dd3 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:51:30 +0800 Subject: [PATCH 01/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 86d82b1d0..6a075dc4a 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ - + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). From 060695bb861f5a91f5c0cd3a7fd3676ca7b8a071 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:51:51 +0800 Subject: [PATCH 02/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6a075dc4a..f82273113 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ - + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). From b2db4d66ce78d0f4e4958c79bafa670c25834e72 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:54:03 +0800 Subject: [PATCH 03/67] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f82273113..27f2c19b7 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ - + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) @@ -191,7 +191,7 @@ Output your evaluation in the following JSON format: title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, year={2024}, -eprint={}, +eprint={2410.05779}, archivePrefix={arXiv}, primaryClass={cs.IR} } From af4e4156732066807eaaaf621d8b73c77c6c416f Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:54:36 +0800 Subject: [PATCH 04/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 27f2c19b7..42de1c1cb 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ - + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). 
![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) From 5e513a71040b1c217990e59d0e98b4aaceeb71e1 Mon Sep 17 00:00:00 2001 From: Larfii <834462287@qq.com> Date: Thu, 10 Oct 2024 12:09:24 +0800 Subject: [PATCH 05/67] update --- lightrag/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index a83afba32..dc497cd44 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.1" +__version__ = "0.0.2" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/GraphEdit" diff --git a/setup.py b/setup.py index df1c3cf42..849fabfe9 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ deps.append(line.strip()) setuptools.setup( - name="light-rag", + name="lightrag-hku", url=vars2readme["__url__"], version=vars2readme["__version__"], author=vars2readme["__author__"], From 5cc02cb34f5f06e05d1f31097974f7ddcdfb84ce Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 14:57:32 +0800 Subject: [PATCH 06/67] Revert "first commit" --- lightrag/base.py | 116 -------------------- lightrag/prompt.py | 256 -------------------------------------------- lightrag/storage.py | 246 ------------------------------------------ lightrag/utils.py | 165 ---------------------------- 4 files changed, 783 deletions(-) delete mode 100644 lightrag/base.py delete mode 100644 lightrag/prompt.py delete mode 100644 lightrag/storage.py delete mode 100644 lightrag/utils.py diff --git a/lightrag/base.py b/lightrag/base.py deleted file mode 100644 index 9c0422feb..000000000 --- a/lightrag/base.py +++ /dev/null @@ -1,116 +0,0 @@ -from dataclasses import dataclass, field -from typing import TypedDict, Union, Literal, Generic, TypeVar - -import numpy as np - -from .utils import EmbeddingFunc - -TextChunkSchema = TypedDict( - "TextChunkSchema", - {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, -) - -T = TypeVar("T") - -@dataclass -class QueryParam: - mode: Literal["local", "global", "hybird", "naive"] = "global" - only_need_context: bool = False - response_type: str = "Multiple Paragraphs" - top_k: int = 60 - max_token_for_text_unit: int = 4000 - max_token_for_global_context: int = 4000 - max_token_for_local_context: int = 4000 - - -@dataclass -class StorageNameSpace: - namespace: str - global_config: dict - - async def index_done_callback(self): - """commit the storage operations after indexing""" - pass - - async def query_done_callback(self): - """commit the storage operations after querying""" - pass - -@dataclass -class BaseVectorStorage(StorageNameSpace): - embedding_func: EmbeddingFunc - meta_fields: set = field(default_factory=set) - - async def query(self, query: str, top_k: int) -> list[dict]: - raise NotImplementedError - - async def upsert(self, data: dict[str, dict]): - """Use 'content' field from value for embedding, use key as id. 
- If embedding_func is None, use 'embedding' field from value - """ - raise NotImplementedError - -@dataclass -class BaseKVStorage(Generic[T], StorageNameSpace): - async def all_keys(self) -> list[str]: - raise NotImplementedError - - async def get_by_id(self, id: str) -> Union[T, None]: - raise NotImplementedError - - async def get_by_ids( - self, ids: list[str], fields: Union[set[str], None] = None - ) -> list[Union[T, None]]: - raise NotImplementedError - - async def filter_keys(self, data: list[str]) -> set[str]: - """return un-exist keys""" - raise NotImplementedError - - async def upsert(self, data: dict[str, T]): - raise NotImplementedError - - async def drop(self): - raise NotImplementedError - - -@dataclass -class BaseGraphStorage(StorageNameSpace): - async def has_node(self, node_id: str) -> bool: - raise NotImplementedError - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - raise NotImplementedError - - async def node_degree(self, node_id: str) -> int: - raise NotImplementedError - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - raise NotImplementedError - - async def get_node(self, node_id: str) -> Union[dict, None]: - raise NotImplementedError - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - raise NotImplementedError - - async def get_node_edges( - self, source_node_id: str - ) -> Union[list[tuple[str, str]], None]: - raise NotImplementedError - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - raise NotImplementedError - - async def clustering(self, algorithm: str): - raise NotImplementedError - - async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file diff --git a/lightrag/prompt.py b/lightrag/prompt.py deleted file mode 100644 index 5d28e49c5..000000000 --- a/lightrag/prompt.py +++ /dev/null @@ -1,256 +0,0 @@ -GRAPH_FIELD_SEP = "" - -PROMPTS = {} - -PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" -PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" -PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" -PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] - -PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] - -PROMPTS[ - "entity_extraction" -] = """-Goal- -Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. - --Steps- -1. Identify all entities. For each identified entity, extract the following information: -- entity_name: Name of the entity, capitalized -- entity_type: One of the following types: [{entity_types}] -- entity_description: Comprehensive description of the entity's attributes and activities -Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter} - -2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. 
-For each pair of related entities, extract the following information: -- source_entity: name of the source entity, as identified in step 1 -- target_entity: name of the target entity, as identified in step 1 -- relationship_description: explanation as to why you think the source entity and the target entity are related to each other -- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity -- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details -Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. -Format the content-level key words as ("content_keywords"{tuple_delimiter}) - -4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. - -5. When finished, output {completion_delimiter} - -###################### --Examples- -###################### -Example 1: - -Entity_types: [person, technology, mission, organization, location] -Text: -while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. - -Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” - -The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. - -It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths -################ -Output: -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} -("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} -("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} -("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} -############################# -Example 2: - -Entity_types: [person, technology, mission, organization, location] -Text: -They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. - -Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. - -Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly -############# -Output: -("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} -("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} -("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} -("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} -############################# -Example 3: - -Entity_types: [person, role, technology, organization, event, location, concept] -Text: -their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. - -"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." - -Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." - -Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
- -The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation -############# -Output: -("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} -("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} -("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} -("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} -("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} -("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} -("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} -############################# --Real Data- -###################### -Entity_types: {entity_types} -Text: {input_text} -###################### -Output: -""" - -PROMPTS[ - "summarize_entity_descriptions" -] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. -Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. -Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. -Make sure it is written in third person, and include the entity names so we the have full context. 
- -####### --Data- -Entities: {entity_name} -Description List: {description_list} -####### -Output: -""" - -PROMPTS[ - "entiti_continue_extraction" -] = """MANY entities were missed in the last extraction. Add them below using the same format: -""" - -PROMPTS[ - "entiti_if_loop_extraction" -] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. -""" - -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." - -PROMPTS[ - "rag_response" -] = """---Role--- - -You are a helpful assistant responding to questions about data in the tables provided. - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. - ----Target response length and format--- - -{response_type} - - ----Data tables--- - -{context_data} - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. - -If you don't know the answer, just say so. Do not make anything up. - -Do not include information where the supporting evidence for it is not provided. - - ----Target response length and format--- - -{response_type} - -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. -""" - -PROMPTS["keywords_extraction"] = """---Role--- - -You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. - ----Goal--- - -Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. - ----Instructions--- - -- Output the keywords in JSON format. -- The JSON should have two keys: - - "high_level_keywords" for overarching concepts or themes. - - "low_level_keywords" for specific entities or details. - -###################### --Examples- -###################### -Example 1: - -Query: "How does international trade influence global economic stability?" -################ -Output: -{{ - "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], - "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] -}} -############################# -Example 2: - -Query: "What are the environmental consequences of deforestation on biodiversity?" -################ -Output: -{{ - "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], - "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] -}} -############################# -Example 3: - -Query: "What is the role of education in reducing poverty?" 
-################ -Output: -{{ - "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], - "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] -}} -############################# --Real Data- -###################### -Query: {query} -###################### -Output: - -""" - -PROMPTS[ - "naive_rag_response" -] = """You're a helpful assistant -Below are the knowledge you know: -{content_data} ---- -If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. ----Target response length and format--- -{response_type} -""" diff --git a/lightrag/storage.py b/lightrag/storage.py deleted file mode 100644 index 2f2bb7d8f..000000000 --- a/lightrag/storage.py +++ /dev/null @@ -1,246 +0,0 @@ -import asyncio -import html -import json -import os -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Union, cast -import pickle -import hnswlib -import networkx as nx -import numpy as np -from nano_vectordb import NanoVectorDB -import xxhash - -from .utils import load_json, logger, write_json -from .base import ( - BaseGraphStorage, - BaseKVStorage, - BaseVectorStorage, -) - -@dataclass -class JsonKVStorage(BaseKVStorage): - def __post_init__(self): - working_dir = self.global_config["working_dir"] - self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") - self._data = load_json(self._file_name) or {} - logger.info(f"Load KV {self.namespace} with {len(self._data)} data") - - async def all_keys(self) -> list[str]: - return list(self._data.keys()) - - async def index_done_callback(self): - write_json(self._data, self._file_name) - - async def get_by_id(self, id): - return self._data.get(id, None) - - async def get_by_ids(self, ids, fields=None): - if fields is None: - return [self._data.get(id, None) for id in ids] - return [ - ( - {k: v for k, v in self._data[id].items() if k in fields} - if self._data.get(id, None) - else None - ) - for id in ids - ] - - async def filter_keys(self, data: list[str]) -> set[str]: - return set([s for s in data if s not in self._data]) - - async def upsert(self, data: dict[str, dict]): - left_data = {k: v for k, v in data.items() if k not in self._data} - self._data.update(left_data) - return left_data - - async def drop(self): - self._data = {} - -@dataclass -class NanoVectorDBStorage(BaseVectorStorage): - cosine_better_than_threshold: float = 0.2 - - def __post_init__(self): - - self._client_file_name = os.path.join( - self.global_config["working_dir"], f"vdb_{self.namespace}.json" - ) - self._max_batch_size = self.global_config["embedding_batch_num"] - self._client = NanoVectorDB( - self.embedding_func.embedding_dim, storage_file=self._client_file_name - ) - self.cosine_better_than_threshold = self.global_config.get( - "cosine_better_than_threshold", self.cosine_better_than_threshold - ) - - async def upsert(self, data: dict[str, dict]): - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert 
an empty data to vector DB") - return [] - list_data = [ - { - "__id__": k, - **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, - } - for k, v in data.items() - ] - contents = [v["content"] for v in data.values()] - batches = [ - contents[i : i + self._max_batch_size] - for i in range(0, len(contents), self._max_batch_size) - ] - embeddings_list = await asyncio.gather( - *[self.embedding_func(batch) for batch in batches] - ) - embeddings = np.concatenate(embeddings_list) - for i, d in enumerate(list_data): - d["__vector__"] = embeddings[i] - results = self._client.upsert(datas=list_data) - return results - - async def query(self, query: str, top_k=5): - embedding = await self.embedding_func([query]) - embedding = embedding[0] - results = self._client.query( - query=embedding, - top_k=top_k, - better_than_threshold=self.cosine_better_than_threshold, - ) - results = [ - {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results - ] - return results - - async def index_done_callback(self): - self._client.save() - -@dataclass -class NetworkXStorage(BaseGraphStorage): - @staticmethod - def load_nx_graph(file_name) -> nx.Graph: - if os.path.exists(file_name): - return nx.read_graphml(file_name) - return None - - @staticmethod - def write_nx_graph(graph: nx.Graph, file_name): - logger.info( - f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" - ) - nx.write_graphml(graph, file_name) - - @staticmethod - def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: - """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py - Return the largest connected component of the graph, with nodes and edges sorted in a stable way. - """ - from graspologic.utils import largest_connected_component - - graph = graph.copy() - graph = cast(nx.Graph, largest_connected_component(graph)) - node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore - graph = nx.relabel_nodes(graph, node_mapping) - return NetworkXStorage._stabilize_graph(graph) - - @staticmethod - def _stabilize_graph(graph: nx.Graph) -> nx.Graph: - """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py - Ensure an undirected graph with the same relationships will always be read the same way. 
- """ - fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() - - sorted_nodes = graph.nodes(data=True) - sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) - - fixed_graph.add_nodes_from(sorted_nodes) - edges = list(graph.edges(data=True)) - - if not graph.is_directed(): - - def _sort_source_target(edge): - source, target, edge_data = edge - if source > target: - temp = source - source = target - target = temp - return source, target, edge_data - - edges = [_sort_source_target(edge) for edge in edges] - - def _get_edge_key(source: Any, target: Any) -> str: - return f"{source} -> {target}" - - edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) - - fixed_graph.add_edges_from(edges) - return fixed_graph - - def __post_init__(self): - self._graphml_xml_file = os.path.join( - self.global_config["working_dir"], f"graph_{self.namespace}.graphml" - ) - preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) - if preloaded_graph is not None: - logger.info( - f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" - ) - self._graph = preloaded_graph or nx.Graph() - self._node_embed_algorithms = { - "node2vec": self._node2vec_embed, - } - - async def index_done_callback(self): - NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) - - async def has_node(self, node_id: str) -> bool: - return self._graph.has_node(node_id) - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - return self._graph.has_edge(source_node_id, target_node_id) - - async def get_node(self, node_id: str) -> Union[dict, None]: - return self._graph.nodes.get(node_id) - - async def node_degree(self, node_id: str) -> int: - return self._graph.degree(node_id) - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - return self._graph.degree(src_id) + self._graph.degree(tgt_id) - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - return self._graph.edges.get((source_node_id, target_node_id)) - - async def get_node_edges(self, source_node_id: str): - if self._graph.has_node(source_node_id): - return list(self._graph.edges(source_node_id)) - return None - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - self._graph.add_node(node_id, **node_data) - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - self._graph.add_edge(source_node_id, target_node_id, **edge_data) - - async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - if algorithm not in self._node_embed_algorithms: - raise ValueError(f"Node embedding algorithm {algorithm} not supported") - return await self._node_embed_algorithms[algorithm]() - - async def _node2vec_embed(self): - from graspologic import embed - - embeddings, nodes = embed.node2vec_embed( - self._graph, - **self.global_config["node2vec_params"], - ) - - nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] - return embeddings, nodes_ids diff --git a/lightrag/utils.py b/lightrag/utils.py deleted file mode 100644 index c75b4270c..000000000 --- a/lightrag/utils.py +++ /dev/null @@ -1,165 +0,0 @@ -import asyncio -import html -import json -import logging -import os -import re -from dataclasses import dataclass -from functools import wraps -from hashlib import md5 -from typing import Any, Union - -import numpy as np -import tiktoken - -ENCODER = None - -logger = 
logging.getLogger("lightrag") - -def set_logger(log_file: str): - logger.setLevel(logging.DEBUG) - - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - file_handler.setFormatter(formatter) - - if not logger.handlers: - logger.addHandler(file_handler) - -@dataclass -class EmbeddingFunc: - embedding_dim: int - max_token_size: int - func: callable - - async def __call__(self, *args, **kwargs) -> np.ndarray: - return await self.func(*args, **kwargs) - -def locate_json_string_body_from_string(content: str) -> Union[str, None]: - """Locate the JSON string body from a string""" - maybe_json_str = re.search(r"{.*}", content, re.DOTALL) - if maybe_json_str is not None: - return maybe_json_str.group(0) - else: - return None - -def convert_response_to_json(response: str) -> dict: - json_str = locate_json_string_body_from_string(response) - assert json_str is not None, f"Unable to parse JSON from response: {response}" - try: - data = json.loads(json_str) - return data - except json.JSONDecodeError as e: - logger.error(f"Failed to parse JSON: {json_str}") - raise e from None - -def compute_args_hash(*args): - return md5(str(args).encode()).hexdigest() - -def compute_mdhash_id(content, prefix: str = ""): - return prefix + md5(content.encode()).hexdigest() - -def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): - """Add restriction of maximum async calling times for a async func""" - - def final_decro(func): - """Not using async.Semaphore to aovid use nest-asyncio""" - __current_size = 0 - - @wraps(func) - async def wait_func(*args, **kwargs): - nonlocal __current_size - while __current_size >= max_size: - await asyncio.sleep(waitting_time) - __current_size += 1 - result = await func(*args, **kwargs) - __current_size -= 1 - return result - - return wait_func - - return final_decro - -def wrap_embedding_func_with_attrs(**kwargs): - """Wrap a function with attributes""" - - def final_decro(func) -> EmbeddingFunc: - new_func = EmbeddingFunc(**kwargs, func=func) - return new_func - - return final_decro - -def load_json(file_name): - if not os.path.exists(file_name): - return None - with open(file_name) as f: - return json.load(f) - -def write_json(json_obj, file_name): - with open(file_name, "w") as f: - json.dump(json_obj, f, indent=2, ensure_ascii=False) - -def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): - global ENCODER - if ENCODER is None: - ENCODER = tiktoken.encoding_for_model(model_name) - tokens = ENCODER.encode(content) - return tokens - - -def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): - global ENCODER - if ENCODER is None: - ENCODER = tiktoken.encoding_for_model(model_name) - content = ENCODER.decode(tokens) - return content - -def pack_user_ass_to_openai_messages(*args: str): - roles = ["user", "assistant"] - return [ - {"role": roles[i % 2], "content": content} for i, content in enumerate(args) - ] - -def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: - """Split a string by multiple markers""" - if not markers: - return [content] - results = re.split("|".join(re.escape(marker) for marker in markers), content) - return [r.strip() for r in results if r.strip()] - -# Refer the utils functions of the official GraphRAG implementation: -# https://github.com/microsoft/graphrag -def clean_str(input: Any) -> str: - """Clean an input string by removing HTML escapes, control 
characters, and other unwanted characters.""" - # If we get non-string input, just give it back - if not isinstance(input, str): - return input - - result = html.unescape(input.strip()) - # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) - -def is_float_regex(value): - return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) - -def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): - """Truncate a list of data by token size""" - if max_token_size <= 0: - return [] - tokens = 0 - for i, data in enumerate(list_data): - tokens += len(encode_string_by_tiktoken(key(data))) - if tokens > max_token_size: - return list_data[:i] - return list_data - -def list_of_list_to_csv(data: list[list]): - return "\n".join( - [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] - ) - -def save_data_to_file(data, file_name): - with open(file_name, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file From 86276325178e37de7eddfd27513a528b33932dcb Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 14:58:21 +0800 Subject: [PATCH 07/67] update --- lightrag/__init__.py | 2 +- lightrag/base.py | 116 ++++++++++++++++++++ lightrag/prompt.py | 256 +++++++++++++++++++++++++++++++++++++++++++ lightrag/storage.py | 246 +++++++++++++++++++++++++++++++++++++++++ lightrag/utils.py | 165 ++++++++++++++++++++++++++++ setup.py | 2 +- 6 files changed, 785 insertions(+), 2 deletions(-) create mode 100644 lightrag/base.py create mode 100644 lightrag/prompt.py create mode 100644 lightrag/storage.py create mode 100644 lightrag/utils.py diff --git a/lightrag/__init__.py b/lightrag/__init__.py index a83afba32..dc497cd44 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.1" +__version__ = "0.0.2" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/GraphEdit" diff --git a/lightrag/base.py b/lightrag/base.py new file mode 100644 index 000000000..9c0422feb --- /dev/null +++ b/lightrag/base.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass, field +from typing import TypedDict, Union, Literal, Generic, TypeVar + +import numpy as np + +from .utils import EmbeddingFunc + +TextChunkSchema = TypedDict( + "TextChunkSchema", + {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, +) + +T = TypeVar("T") + +@dataclass +class QueryParam: + mode: Literal["local", "global", "hybird", "naive"] = "global" + only_need_context: bool = False + response_type: str = "Multiple Paragraphs" + top_k: int = 60 + max_token_for_text_unit: int = 4000 + max_token_for_global_context: int = 4000 + max_token_for_local_context: int = 4000 + + +@dataclass +class StorageNameSpace: + namespace: str + global_config: dict + + async def index_done_callback(self): + """commit the storage operations after indexing""" + pass + + async def query_done_callback(self): + """commit the storage operations after querying""" + pass + +@dataclass +class BaseVectorStorage(StorageNameSpace): + embedding_func: EmbeddingFunc + meta_fields: set = field(default_factory=set) + + async def query(self, query: str, top_k: int) -> list[dict]: + raise NotImplementedError + + async def upsert(self, data: dict[str, dict]): + """Use 'content' field from value for embedding, use key as id. 
+ If embedding_func is None, use 'embedding' field from value + """ + raise NotImplementedError + +@dataclass +class BaseKVStorage(Generic[T], StorageNameSpace): + async def all_keys(self) -> list[str]: + raise NotImplementedError + + async def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + async def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + async def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + async def upsert(self, data: dict[str, T]): + raise NotImplementedError + + async def drop(self): + raise NotImplementedError + + +@dataclass +class BaseGraphStorage(StorageNameSpace): + async def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + async def node_degree(self, node_id: str) -> int: + raise NotImplementedError + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + async def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + raise NotImplementedError + + async def get_node_edges( + self, source_node_id: str + ) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + async def clustering(self, algorithm: str): + raise NotImplementedError + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file diff --git a/lightrag/prompt.py b/lightrag/prompt.py new file mode 100644 index 000000000..5d28e49c5 --- /dev/null +++ b/lightrag/prompt.py @@ -0,0 +1,256 @@ +GRAPH_FIELD_SEP = "" + +PROMPTS = {} + +PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" +PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" +PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" +PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] + +PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] + +PROMPTS[ + "entity_extraction" +] = """-Goal- +Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. + +-Steps- +1. Identify all entities. For each identified entity, extract the following information: +- entity_name: Name of the entity, capitalized +- entity_type: One of the following types: [{entity_types}] +- entity_description: Comprehensive description of the entity's attributes and activities +Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter} + +2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. 
+For each pair of related entities, extract the following information: +- source_entity: name of the source entity, as identified in step 1 +- target_entity: name of the target entity, as identified in step 1 +- relationship_description: explanation as to why you think the source entity and the target entity are related to each other +- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity +- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details +Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. +Format the content-level key words as ("content_keywords"{tuple_delimiter}) + +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. + +5. When finished, output {completion_delimiter} + +###################### +-Examples- +###################### +Example 1: + +Entity_types: [person, technology, mission, organization, location] +Text: +while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. + +Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” + +The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. + +It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} +("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} +("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} +("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} +############################# +Example 2: + +Entity_types: [person, technology, mission, organization, location] +Text: +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. + +Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. + +Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly +############# +Output: +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} +("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} +("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} +("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} +############################# +Example 3: + +Entity_types: [person, role, technology, organization, event, location, concept] +Text: +their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." + +Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." + +Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
+ +The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation +############# +Output: +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} +("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} +("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} +("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} +("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} +("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} +("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +PROMPTS[ + "summarize_entity_descriptions" +] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. 
+ +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +PROMPTS[ + "entiti_continue_extraction" +] = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +PROMPTS[ + "entiti_if_loop_extraction" +] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." + +PROMPTS[ + "rag_response" +] = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + + +---Data tables--- + +{context_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +PROMPTS["keywords_extraction"] = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. + +---Goal--- + +Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. + +---Instructions--- + +- Output the keywords in JSON format. +- The JSON should have two keys: + - "high_level_keywords" for overarching concepts or themes. + - "low_level_keywords" for specific entities or details. + +###################### +-Examples- +###################### +Example 1: + +Query: "How does international trade influence global economic stability?" +################ +Output: +{{ + "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], + "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] +}} +############################# +Example 2: + +Query: "What are the environmental consequences of deforestation on biodiversity?" +################ +Output: +{{ + "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], + "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] +}} +############################# +Example 3: + +Query: "What is the role of education in reducing poverty?" 
+################ +Output: +{{ + "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], + "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] +}} +############################# +-Real Data- +###################### +Query: {query} +###################### +Output: + +""" + +PROMPTS[ + "naive_rag_response" +] = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" diff --git a/lightrag/storage.py b/lightrag/storage.py new file mode 100644 index 000000000..2f2bb7d8f --- /dev/null +++ b/lightrag/storage.py @@ -0,0 +1,246 @@ +import asyncio +import html +import json +import os +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Union, cast +import pickle +import hnswlib +import networkx as nx +import numpy as np +from nano_vectordb import NanoVectorDB +import xxhash + +from .utils import load_json, logger, write_json +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, +) + +@dataclass +class JsonKVStorage(BaseKVStorage): + def __post_init__(self): + working_dir = self.global_config["working_dir"] + self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") + self._data = load_json(self._file_name) or {} + logger.info(f"Load KV {self.namespace} with {len(self._data)} data") + + async def all_keys(self) -> list[str]: + return list(self._data.keys()) + + async def index_done_callback(self): + write_json(self._data, self._file_name) + + async def get_by_id(self, id): + return self._data.get(id, None) + + async def get_by_ids(self, ids, fields=None): + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + async def filter_keys(self, data: list[str]) -> set[str]: + return set([s for s in data if s not in self._data]) + + async def upsert(self, data: dict[str, dict]): + left_data = {k: v for k, v in data.items() if k not in self._data} + self._data.update(left_data) + return left_data + + async def drop(self): + self._data = {} + +@dataclass +class NanoVectorDBStorage(BaseVectorStorage): + cosine_better_than_threshold: float = 0.2 + + def __post_init__(self): + + self._client_file_name = os.path.join( + self.global_config["working_dir"], f"vdb_{self.namespace}.json" + ) + self._max_batch_size = self.global_config["embedding_batch_num"] + self._client = NanoVectorDB( + self.embedding_func.embedding_dim, storage_file=self._client_file_name + ) + self.cosine_better_than_threshold = self.global_config.get( + "cosine_better_than_threshold", self.cosine_better_than_threshold + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not len(data): + logger.warning("You insert an 
empty data to vector DB") + return [] + list_data = [ + { + "__id__": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + for i, d in enumerate(list_data): + d["__vector__"] = embeddings[i] + results = self._client.upsert(datas=list_data) + return results + + async def query(self, query: str, top_k=5): + embedding = await self.embedding_func([query]) + embedding = embedding[0] + results = self._client.query( + query=embedding, + top_k=top_k, + better_than_threshold=self.cosine_better_than_threshold, + ) + results = [ + {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results + ] + return results + + async def index_done_callback(self): + self._client.save() + +@dataclass +class NetworkXStorage(BaseGraphStorage): + @staticmethod + def load_nx_graph(file_name) -> nx.Graph: + if os.path.exists(file_name): + return nx.read_graphml(file_name) + return None + + @staticmethod + def write_nx_graph(graph: nx.Graph, file_name): + logger.info( + f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" + ) + nx.write_graphml(graph, file_name) + + @staticmethod + def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Return the largest connected component of the graph, with nodes and edges sorted in a stable way. + """ + from graspologic.utils import largest_connected_component + + graph = graph.copy() + graph = cast(nx.Graph, largest_connected_component(graph)) + node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + graph = nx.relabel_nodes(graph, node_mapping) + return NetworkXStorage._stabilize_graph(graph) + + @staticmethod + def _stabilize_graph(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Ensure an undirected graph with the same relationships will always be read the same way. 
+ """ + fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() + + sorted_nodes = graph.nodes(data=True) + sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) + + fixed_graph.add_nodes_from(sorted_nodes) + edges = list(graph.edges(data=True)) + + if not graph.is_directed(): + + def _sort_source_target(edge): + source, target, edge_data = edge + if source > target: + temp = source + source = target + target = temp + return source, target, edge_data + + edges = [_sort_source_target(edge) for edge in edges] + + def _get_edge_key(source: Any, target: Any) -> str: + return f"{source} -> {target}" + + edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) + + fixed_graph.add_edges_from(edges) + return fixed_graph + + def __post_init__(self): + self._graphml_xml_file = os.path.join( + self.global_config["working_dir"], f"graph_{self.namespace}.graphml" + ) + preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) + if preloaded_graph is not None: + logger.info( + f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" + ) + self._graph = preloaded_graph or nx.Graph() + self._node_embed_algorithms = { + "node2vec": self._node2vec_embed, + } + + async def index_done_callback(self): + NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) + + async def has_node(self, node_id: str) -> bool: + return self._graph.has_node(node_id) + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + return self._graph.has_edge(source_node_id, target_node_id) + + async def get_node(self, node_id: str) -> Union[dict, None]: + return self._graph.nodes.get(node_id) + + async def node_degree(self, node_id: str) -> int: + return self._graph.degree(node_id) + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + return self._graph.degree(src_id) + self._graph.degree(tgt_id) + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + return self._graph.edges.get((source_node_id, target_node_id)) + + async def get_node_edges(self, source_node_id: str): + if self._graph.has_node(source_node_id): + return list(self._graph.edges(source_node_id)) + return None + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + self._graph.add_node(node_id, **node_data) + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + self._graph.add_edge(source_node_id, target_node_id, **edge_data) + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + if algorithm not in self._node_embed_algorithms: + raise ValueError(f"Node embedding algorithm {algorithm} not supported") + return await self._node_embed_algorithms[algorithm]() + + async def _node2vec_embed(self): + from graspologic import embed + + embeddings, nodes = embed.node2vec_embed( + self._graph, + **self.global_config["node2vec_params"], + ) + + nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] + return embeddings, nodes_ids diff --git a/lightrag/utils.py b/lightrag/utils.py new file mode 100644 index 000000000..c75b4270c --- /dev/null +++ b/lightrag/utils.py @@ -0,0 +1,165 @@ +import asyncio +import html +import json +import logging +import os +import re +from dataclasses import dataclass +from functools import wraps +from hashlib import md5 +from typing import Any, Union + +import numpy as np +import tiktoken + +ENCODER = None + +logger = 
logging.getLogger("lightrag") + +def set_logger(log_file: str): + logger.setLevel(logging.DEBUG) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + + if not logger.handlers: + logger.addHandler(file_handler) + +@dataclass +class EmbeddingFunc: + embedding_dim: int + max_token_size: int + func: callable + + async def __call__(self, *args, **kwargs) -> np.ndarray: + return await self.func(*args, **kwargs) + +def locate_json_string_body_from_string(content: str) -> Union[str, None]: + """Locate the JSON string body from a string""" + maybe_json_str = re.search(r"{.*}", content, re.DOTALL) + if maybe_json_str is not None: + return maybe_json_str.group(0) + else: + return None + +def convert_response_to_json(response: str) -> dict: + json_str = locate_json_string_body_from_string(response) + assert json_str is not None, f"Unable to parse JSON from response: {response}" + try: + data = json.loads(json_str) + return data + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON: {json_str}") + raise e from None + +def compute_args_hash(*args): + return md5(str(args).encode()).hexdigest() + +def compute_mdhash_id(content, prefix: str = ""): + return prefix + md5(content.encode()).hexdigest() + +def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): + """Add restriction of maximum async calling times for a async func""" + + def final_decro(func): + """Not using async.Semaphore to aovid use nest-asyncio""" + __current_size = 0 + + @wraps(func) + async def wait_func(*args, **kwargs): + nonlocal __current_size + while __current_size >= max_size: + await asyncio.sleep(waitting_time) + __current_size += 1 + result = await func(*args, **kwargs) + __current_size -= 1 + return result + + return wait_func + + return final_decro + +def wrap_embedding_func_with_attrs(**kwargs): + """Wrap a function with attributes""" + + def final_decro(func) -> EmbeddingFunc: + new_func = EmbeddingFunc(**kwargs, func=func) + return new_func + + return final_decro + +def load_json(file_name): + if not os.path.exists(file_name): + return None + with open(file_name) as f: + return json.load(f) + +def write_json(json_obj, file_name): + with open(file_name, "w") as f: + json.dump(json_obj, f, indent=2, ensure_ascii=False) + +def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + tokens = ENCODER.encode(content) + return tokens + + +def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + content = ENCODER.decode(tokens) + return content + +def pack_user_ass_to_openai_messages(*args: str): + roles = ["user", "assistant"] + return [ + {"role": roles[i % 2], "content": content} for i, content in enumerate(args) + ] + +def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: + """Split a string by multiple markers""" + if not markers: + return [content] + results = re.split("|".join(re.escape(marker) for marker in markers), content) + return [r.strip() for r in results if r.strip()] + +# Refer the utils functions of the official GraphRAG implementation: +# https://github.com/microsoft/graphrag +def clean_str(input: Any) -> str: + """Clean an input string by removing HTML escapes, control 
characters, and other unwanted characters.""" + # If we get non-string input, just give it back + if not isinstance(input, str): + return input + + result = html.unescape(input.strip()) + # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python + return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + +def is_float_regex(value): + return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + +def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): + """Truncate a list of data by token size""" + if max_token_size <= 0: + return [] + tokens = 0 + for i, data in enumerate(list_data): + tokens += len(encode_string_by_tiktoken(key(data))) + if tokens > max_token_size: + return list_data[:i] + return list_data + +def list_of_list_to_csv(data: list[list]): + return "\n".join( + [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] + ) + +def save_data_to_file(data, file_name): + with open(file_name, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file diff --git a/setup.py b/setup.py index df1c3cf42..849fabfe9 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ deps.append(line.strip()) setuptools.setup( - name="light-rag", + name="lightrag-hku", url=vars2readme["__url__"], version=vars2readme["__version__"], author=vars2readme["__author__"], From 5931e4bccb90bfe9517e4f478f1ba10170be77ec Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 15:00:31 +0800 Subject: [PATCH 08/67] Revert "first commit" --- README.md | 198 --------- lightrag/__init__.py | 5 - lightrag/base.py | 116 ------ lightrag/lightrag.py | 300 -------------- lightrag/llm.py | 88 ---- lightrag/operate.py | 944 ------------------------------------------- lightrag/prompt.py | 256 ------------ lightrag/storage.py | 246 ----------- lightrag/utils.py | 165 -------- 9 files changed, 2318 deletions(-) delete mode 100644 README.md delete mode 100644 lightrag/__init__.py delete mode 100644 lightrag/base.py delete mode 100644 lightrag/lightrag.py delete mode 100644 lightrag/llm.py delete mode 100644 lightrag/operate.py delete mode 100644 lightrag/prompt.py delete mode 100644 lightrag/storage.py delete mode 100644 lightrag/utils.py diff --git a/README.md b/README.md deleted file mode 100644 index 42de1c1cb..000000000 --- a/README.md +++ /dev/null @@ -1,198 +0,0 @@ -# LightRAG: Simple and Fast Retrieval-Augmented Generation -![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) - - - - - - -This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). -![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) -## Install - -* Install from source - -```bash -cd LightRAG -pip install -e . 
-``` -* Install from PyPI -```bash -pip install lightrag-hku -``` - -## Quick Start - -* Set OpenAI API key in environment: `export OPENAI_API_KEY="sk-...".` -* Download the demo text "A Christmas Carol by Charles Dickens" -```bash -curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt -``` -Use the below python snippet: - -```python -from lightrag import LightRAG, QueryParam - -rag = LightRAG(working_dir="./dickens") - -with open("./book.txt") as f: - rag.insert(f.read()) - -# Perform naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) - -# Perform local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) - -# Perform global search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) - -# Perform hybird search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybird"))) -``` -Batch Insert -```python -rag.insert(["TEXT1", "TEXT2",...]) -``` -Incremental Insert - -```python -rag = LightRAG(working_dir="./dickens") - -with open("./newText.txt") as f: - rag.insert(f.read()) -``` -## Evaluation -### Dataset -The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). - -### Generate Query -LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`. -```python -Given the following description of a dataset: - -{description} - -Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset. - -Output the results in the following structure: -- User 1: [user description] - - Task 1: [task description] - - Question 1: - - Question 2: - - Question 3: - - Question 4: - - Question 5: - - Task 2: [task description] - ... - - Task 5: [task description] -- User 2: [user description] - ... -- User 5: [user description] - ... -``` - - ### Batch Eval -To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`. -```python ----Role--- -You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. ----Goal--- -You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. - -- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? -- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? -- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic? - -For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories. - -Here is the question: -{query} - -Here are the two answers: - -**Answer 1:** -{answer1} - -**Answer 2:** -{answer2} - -Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion. 
- -Output your evaluation in the following JSON format: - -{{ - "Comprehensiveness": {{ - "Winner": "[Answer 1 or Answer 2]", - "Explanation": "[Provide explanation here]" - }}, - "Empowerment": {{ - "Winner": "[Answer 1 or Answer 2]", - "Explanation": "[Provide explanation here]" - }}, - "Overall Winner": {{ - "Winner": "[Answer 1 or Answer 2]", - "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]" - }} -}} -``` -### Overall Performance Table -### Overall Performance Table -| | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | -|----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| -| | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | -| **Comprehensiveness** | 32.69% | **67.31%** | 35.44% | **64.56%** | 19.05% | **80.95%** | 36.36% | **63.64%** | -| **Diversity** | 24.09% | **75.91%** | 35.24% | **64.76%** | 10.98% | **89.02%** | 30.76% | **69.24%** | -| **Empowerment** | 31.35% | **68.65%** | 35.48% | **64.52%** | 17.59% | **82.41%** | 40.95% | **59.05%** | -| **Overall** | 33.30% | **66.70%** | 34.76% | **65.24%** | 17.46% | **82.54%** | 37.59% | **62.40%** | -| | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | -| **Comprehensiveness** | 32.05% | **67.95%** | 39.30% | **60.70%** | 18.57% | **81.43%** | 38.89% | **61.11%** | -| **Diversity** | 29.44% | **70.56%** | 38.71% | **61.29%** | 15.14% | **84.86%** | 28.50% | **71.50%** | -| **Empowerment** | 32.51% | **67.49%** | 37.52% | **62.48%** | 17.80% | **82.20%** | 43.96% | **56.04%** | -| **Overall** | 33.29% | **66.71%** | 39.03% | **60.97%** | 17.80% | **82.20%** | 39.61% | **60.39%** | -| | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | -| **Comprehensiveness** | 24.39% | **75.61%** | 36.49% | **63.51%** | 27.68% | **72.32%** | 42.17% | **57.83%** | -| **Diversity** | 24.96% | **75.34%** | 37.41% | **62.59%** | 18.79% | **81.21%** | 30.88% | **69.12%** | -| **Empowerment** | 24.89% | **75.11%** | 34.99% | **65.01%** | 26.99% | **73.01%** | **45.61%** | **54.39%** | -| **Overall** | 23.17% | **76.83%** | 35.67% | **64.33%** | 27.68% | **72.32%** | 42.72% | **57.28%** | -| | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | -| **Comprehensiveness** | 45.56% | **54.44%** | 45.98% | **54.02%** | 47.13% | **52.87%** | **51.86%** | 48.14% | -| **Diversity** | 19.65% | **80.35%** | 39.64% | **60.36%** | 25.55% | **74.45%** | 35.87% | **64.13%** | -| **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | -| **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | - -## Code Structure - -```python -. 
-├── examples -│ ├── batch_eval.py -│ ├── generate_query.py -│ ├── insert.py -│ └── query.py -├── lightrag -│ ├── __init__.py -│ ├── base.py -│ ├── lightrag.py -│ ├── llm.py -│ ├── operate.py -│ ├── prompt.py -│ ├── storage.py -│ └── utils.jpeg -├── LICENSE -├── README.md -├── requirements.txt -└── setup.py -``` -## Citation - -``` -@article{guo2024lightrag, -title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, -author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, -year={2024}, -eprint={2410.05779}, -archivePrefix={arXiv}, -primaryClass={cs.IR} -} -``` diff --git a/lightrag/__init__.py b/lightrag/__init__.py deleted file mode 100644 index dc497cd44..000000000 --- a/lightrag/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .lightrag import LightRAG, QueryParam - -__version__ = "0.0.2" -__author__ = "Zirui Guo" -__url__ = "https://github.com/HKUDS/GraphEdit" diff --git a/lightrag/base.py b/lightrag/base.py deleted file mode 100644 index 9c0422feb..000000000 --- a/lightrag/base.py +++ /dev/null @@ -1,116 +0,0 @@ -from dataclasses import dataclass, field -from typing import TypedDict, Union, Literal, Generic, TypeVar - -import numpy as np - -from .utils import EmbeddingFunc - -TextChunkSchema = TypedDict( - "TextChunkSchema", - {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, -) - -T = TypeVar("T") - -@dataclass -class QueryParam: - mode: Literal["local", "global", "hybird", "naive"] = "global" - only_need_context: bool = False - response_type: str = "Multiple Paragraphs" - top_k: int = 60 - max_token_for_text_unit: int = 4000 - max_token_for_global_context: int = 4000 - max_token_for_local_context: int = 4000 - - -@dataclass -class StorageNameSpace: - namespace: str - global_config: dict - - async def index_done_callback(self): - """commit the storage operations after indexing""" - pass - - async def query_done_callback(self): - """commit the storage operations after querying""" - pass - -@dataclass -class BaseVectorStorage(StorageNameSpace): - embedding_func: EmbeddingFunc - meta_fields: set = field(default_factory=set) - - async def query(self, query: str, top_k: int) -> list[dict]: - raise NotImplementedError - - async def upsert(self, data: dict[str, dict]): - """Use 'content' field from value for embedding, use key as id. 
- If embedding_func is None, use 'embedding' field from value - """ - raise NotImplementedError - -@dataclass -class BaseKVStorage(Generic[T], StorageNameSpace): - async def all_keys(self) -> list[str]: - raise NotImplementedError - - async def get_by_id(self, id: str) -> Union[T, None]: - raise NotImplementedError - - async def get_by_ids( - self, ids: list[str], fields: Union[set[str], None] = None - ) -> list[Union[T, None]]: - raise NotImplementedError - - async def filter_keys(self, data: list[str]) -> set[str]: - """return un-exist keys""" - raise NotImplementedError - - async def upsert(self, data: dict[str, T]): - raise NotImplementedError - - async def drop(self): - raise NotImplementedError - - -@dataclass -class BaseGraphStorage(StorageNameSpace): - async def has_node(self, node_id: str) -> bool: - raise NotImplementedError - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - raise NotImplementedError - - async def node_degree(self, node_id: str) -> int: - raise NotImplementedError - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - raise NotImplementedError - - async def get_node(self, node_id: str) -> Union[dict, None]: - raise NotImplementedError - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - raise NotImplementedError - - async def get_node_edges( - self, source_node_id: str - ) -> Union[list[tuple[str, str]], None]: - raise NotImplementedError - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - raise NotImplementedError - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - raise NotImplementedError - - async def clustering(self, algorithm: str): - raise NotImplementedError - - async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py deleted file mode 100644 index 836fda9ec..000000000 --- a/lightrag/lightrag.py +++ /dev/null @@ -1,300 +0,0 @@ -import asyncio -import os -from dataclasses import asdict, dataclass, field -from datetime import datetime -from functools import partial -from typing import Type, cast - -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding -from .operate import ( - chunking_by_token_size, - extract_entities, - local_query, - global_query, - hybird_query, - naive_query, -) - -from .storage import ( - JsonKVStorage, - NanoVectorDBStorage, - NetworkXStorage, -) -from .utils import ( - EmbeddingFunc, - compute_mdhash_id, - limit_async_func_call, - convert_response_to_json, - logger, - set_logger, -) -from .base import ( - BaseGraphStorage, - BaseKVStorage, - BaseVectorStorage, - StorageNameSpace, - QueryParam, -) - -def always_get_an_event_loop() -> asyncio.AbstractEventLoop: - try: - # If there is already an event loop, use it. - loop = asyncio.get_event_loop() - except RuntimeError: - # If in a sub-thread, create a new event loop. 
- logger.info("Creating a new event loop in a sub-thread.") - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop - -@dataclass -class LightRAG: - working_dir: str = field( - default_factory=lambda: f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" - ) - - # text chunking - chunk_token_size: int = 1200 - chunk_overlap_token_size: int = 100 - tiktoken_model_name: str = "gpt-4o-mini" - - # entity extraction - entity_extract_max_gleaning: int = 1 - entity_summary_to_max_tokens: int = 500 - - # node embedding - node_embedding_algorithm: str = "node2vec" - node2vec_params: dict = field( - default_factory=lambda: { - "dimensions": 1536, - "num_walks": 10, - "walk_length": 40, - "num_walks": 10, - "window_size": 2, - "iterations": 3, - "random_seed": 3, - } - ) - - # text embedding - embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) - embedding_batch_num: int = 32 - embedding_func_max_async: int = 16 - - # LLM - llm_model_func: callable = gpt_4o_mini_complete - llm_model_max_token_size: int = 32768 - llm_model_max_async: int = 16 - - # storage - key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage - vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage - vector_db_storage_cls_kwargs: dict = field(default_factory=dict) - graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage - enable_llm_cache: bool = True - - # extension - addon_params: dict = field(default_factory=dict) - convert_response_to_json_func: callable = convert_response_to_json - - def __post_init__(self): - log_file = os.path.join(self.working_dir, "lightrag.log") - set_logger(log_file) - logger.info(f"Logger initialized for working directory: {self.working_dir}") - - _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) - logger.debug(f"LightRAG init with param:\n {_print_config}\n") - - if not os.path.exists(self.working_dir): - logger.info(f"Creating working directory {self.working_dir}") - os.makedirs(self.working_dir) - - self.full_docs = self.key_string_value_json_storage_cls( - namespace="full_docs", global_config=asdict(self) - ) - - self.text_chunks = self.key_string_value_json_storage_cls( - namespace="text_chunks", global_config=asdict(self) - ) - - self.llm_response_cache = ( - self.key_string_value_json_storage_cls( - namespace="llm_response_cache", global_config=asdict(self) - ) - if self.enable_llm_cache - else None - ) - self.chunk_entity_relation_graph = self.graph_storage_cls( - namespace="chunk_entity_relation", global_config=asdict(self) - ) - self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( - self.embedding_func - ) - self.entities_vdb = ( - self.vector_db_storage_cls( - namespace="entities", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"entity_name"} - ) - ) - self.relationships_vdb = ( - self.vector_db_storage_cls( - namespace="relationships", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"src_id", "tgt_id"} - ) - ) - self.chunks_vdb = ( - self.vector_db_storage_cls( - namespace="chunks", - global_config=asdict(self), - embedding_func=self.embedding_func, - ) - ) - - self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( - partial(self.llm_model_func, hashing_kv=self.llm_response_cache) - ) - - def insert(self, string_or_strings): - loop = always_get_an_event_loop() - return loop.run_until_complete(self.ainsert(string_or_strings)) - - async def ainsert(self, 
string_or_strings): - try: - if isinstance(string_or_strings, str): - string_or_strings = [string_or_strings] - - new_docs = { - compute_mdhash_id(c.strip(), prefix="doc-"): {"content": c.strip()} - for c in string_or_strings - } - _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - if not len(new_docs): - logger.warning(f"All docs are already in the storage") - return - logger.info(f"[New Docs] inserting {len(new_docs)} docs") - - inserting_chunks = {} - for doc_key, doc in new_docs.items(): - chunks = { - compute_mdhash_id(dp["content"], prefix="chunk-"): { - **dp, - "full_doc_id": doc_key, - } - for dp in chunking_by_token_size( - doc["content"], - overlap_token_size=self.chunk_overlap_token_size, - max_token_size=self.chunk_token_size, - tiktoken_model=self.tiktoken_model_name, - ) - } - inserting_chunks.update(chunks) - _add_chunk_keys = await self.text_chunks.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } - if not len(inserting_chunks): - logger.warning(f"All chunks are already in the storage") - return - logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") - - await self.chunks_vdb.upsert(inserting_chunks) - - logger.info("[Entity Extraction]...") - maybe_new_kg = await extract_entities( - inserting_chunks, - knwoledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - global_config=asdict(self), - ) - if maybe_new_kg is None: - logger.warning("No new entities and relationships found") - return - self.chunk_entity_relation_graph = maybe_new_kg - - await self.full_docs.upsert(new_docs) - await self.text_chunks.upsert(inserting_chunks) - finally: - await self._insert_done() - - async def _insert_done(self): - tasks = [] - for storage_inst in [ - self.full_docs, - self.text_chunks, - self.llm_response_cache, - self.entities_vdb, - self.relationships_vdb, - self.chunks_vdb, - self.chunk_entity_relation_graph, - ]: - if storage_inst is None: - continue - tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) - await asyncio.gather(*tasks) - - def query(self, query: str, param: QueryParam = QueryParam()): - loop = always_get_an_event_loop() - return loop.run_until_complete(self.aquery(query, param)) - - async def aquery(self, query: str, param: QueryParam = QueryParam()): - if param.mode == "local": - response = await local_query( - query, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.text_chunks, - param, - asdict(self), - ) - elif param.mode == "global": - response = await global_query( - query, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.text_chunks, - param, - asdict(self), - ) - elif param.mode == "hybird": - response = await hybird_query( - query, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.text_chunks, - param, - asdict(self), - ) - elif param.mode == "naive": - response = await naive_query( - query, - self.chunks_vdb, - self.text_chunks, - param, - asdict(self), - ) - else: - raise ValueError(f"Unknown mode {param.mode}") - await self._query_done() - return response - - - async def _query_done(self): - tasks = [] - for storage_inst in [self.llm_response_cache]: - if storage_inst is None: - continue - tasks.append(cast(StorageNameSpace, 
storage_inst).index_done_callback()) - await asyncio.gather(*tasks) - - diff --git a/lightrag/llm.py b/lightrag/llm.py deleted file mode 100644 index ee700a104..000000000 --- a/lightrag/llm.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -import numpy as np -from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, -) - -from .base import BaseKVStorage -from .utils import compute_args_hash, wrap_embedding_func_with_attrs - -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), -) -async def openai_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], **kwargs -) -> str: - openai_async_client = AsyncOpenAI() - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - messages = [] - if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - messages.extend(history_messages) - messages.append({"role": "user", "content": prompt}) - if hashing_kv is not None: - args_hash = compute_args_hash(model, messages) - if_cache_return = await hashing_kv.get_by_id(args_hash) - if if_cache_return is not None: - return if_cache_return["return"] - - response = await openai_async_client.chat.completions.create( - model=model, messages=messages, **kwargs - ) - - if hashing_kv is not None: - await hashing_kv.upsert( - {args_hash: {"return": response.choices[0].message.content, "model": model}} - ) - return response.choices[0].message.content - -async def gpt_4o_complete( - prompt, system_prompt=None, history_messages=[], **kwargs -) -> str: - return await openai_complete_if_cache( - "gpt-4o", - prompt, - system_prompt=system_prompt, - history_messages=history_messages, - **kwargs, - ) - - -async def gpt_4o_mini_complete( - prompt, system_prompt=None, history_messages=[], **kwargs -) -> str: - return await openai_complete_if_cache( - "gpt-4o-mini", - prompt, - system_prompt=system_prompt, - history_messages=history_messages, - **kwargs, - ) - -@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), -) -async def openai_embedding(texts: list[str]) -> np.ndarray: - openai_async_client = AsyncOpenAI() - response = await openai_async_client.embeddings.create( - model="text-embedding-3-small", input=texts, encoding_format="float" - ) - return np.array([dp.embedding for dp in response.data]) - -if __name__ == "__main__": - import asyncio - - async def main(): - result = await gpt_4o_mini_complete('How are you?') - print(result) - - asyncio.run(main()) diff --git a/lightrag/operate.py b/lightrag/operate.py deleted file mode 100644 index 2d3271da8..000000000 --- a/lightrag/operate.py +++ /dev/null @@ -1,944 +0,0 @@ -import asyncio -import json -import re -from typing import Union -from collections import Counter, defaultdict - -from .utils import ( - logger, - clean_str, - compute_mdhash_id, - decode_tokens_by_tiktoken, - encode_string_by_tiktoken, - is_float_regex, - list_of_list_to_csv, - pack_user_ass_to_openai_messages, - split_string_by_multi_markers, - truncate_list_by_token_size, -) -from .base import ( - BaseGraphStorage, - BaseKVStorage, - BaseVectorStorage, - TextChunkSchema, - QueryParam, -) -from .prompt import GRAPH_FIELD_SEP, PROMPTS - 
-def chunking_by_token_size( - content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" -): - tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) - results = [] - for index, start in enumerate( - range(0, len(tokens), max_token_size - overlap_token_size) - ): - chunk_content = decode_tokens_by_tiktoken( - tokens[start : start + max_token_size], model_name=tiktoken_model - ) - results.append( - { - "tokens": min(max_token_size, len(tokens) - start), - "content": chunk_content.strip(), - "chunk_order_index": index, - } - ) - return results - -async def _handle_entity_relation_summary( - entity_or_relation_name: str, - description: str, - global_config: dict, -) -> str: - use_llm_func: callable = global_config["llm_model_func"] - llm_max_tokens = global_config["llm_model_max_token_size"] - tiktoken_model_name = global_config["tiktoken_model_name"] - summary_max_tokens = global_config["entity_summary_to_max_tokens"] - - tokens = encode_string_by_tiktoken(description, model_name=tiktoken_model_name) - if len(tokens) < summary_max_tokens: # No need for summary - return description - prompt_template = PROMPTS["summarize_entity_descriptions"] - use_description = decode_tokens_by_tiktoken( - tokens[:llm_max_tokens], model_name=tiktoken_model_name - ) - context_base = dict( - entity_name=entity_or_relation_name, - description_list=use_description.split(GRAPH_FIELD_SEP), - ) - use_prompt = prompt_template.format(**context_base) - logger.debug(f"Trigger summary: {entity_or_relation_name}") - summary = await use_llm_func(use_prompt, max_tokens=summary_max_tokens) - return summary - - -async def _handle_single_entity_extraction( - record_attributes: list[str], - chunk_key: str, -): - if record_attributes[0] != '"entity"' or len(record_attributes) < 4: - return None - # add this record as a node in the G - entity_name = clean_str(record_attributes[1].upper()) - if not entity_name.strip(): - return None - entity_type = clean_str(record_attributes[2].upper()) - entity_description = clean_str(record_attributes[3]) - entity_source_id = chunk_key - return dict( - entity_name=entity_name, - entity_type=entity_type, - description=entity_description, - source_id=entity_source_id, - ) - - -async def _handle_single_relationship_extraction( - record_attributes: list[str], - chunk_key: str, -): - if record_attributes[0] != '"relationship"' or len(record_attributes) < 5: - return None - # add this record as edge - source = clean_str(record_attributes[1].upper()) - target = clean_str(record_attributes[2].upper()) - edge_description = clean_str(record_attributes[3]) - - edge_keywords = clean_str(record_attributes[4]) - edge_source_id = chunk_key - weight = ( - float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0 - ) - return dict( - src_id=source, - tgt_id=target, - weight=weight, - description=edge_description, - keywords=edge_keywords, - source_id=edge_source_id, - ) - - -async def _merge_nodes_then_upsert( - entity_name: str, - nodes_data: list[dict], - knwoledge_graph_inst: BaseGraphStorage, - global_config: dict, -): - already_entitiy_types = [] - already_source_ids = [] - already_description = [] - - already_node = await knwoledge_graph_inst.get_node(entity_name) - if already_node is not None: - already_entitiy_types.append(already_node["entity_type"]) - already_source_ids.extend( - split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP]) - ) - already_description.append(already_node["description"]) - - entity_type = sorted( 
- Counter( - [dp["entity_type"] for dp in nodes_data] + already_entitiy_types - ).items(), - key=lambda x: x[1], - reverse=True, - )[0][0] - description = GRAPH_FIELD_SEP.join( - sorted(set([dp["description"] for dp in nodes_data] + already_description)) - ) - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in nodes_data] + already_source_ids) - ) - description = await _handle_entity_relation_summary( - entity_name, description, global_config - ) - node_data = dict( - entity_type=entity_type, - description=description, - source_id=source_id, - ) - await knwoledge_graph_inst.upsert_node( - entity_name, - node_data=node_data, - ) - node_data["entity_name"] = entity_name - return node_data - - -async def _merge_edges_then_upsert( - src_id: str, - tgt_id: str, - edges_data: list[dict], - knwoledge_graph_inst: BaseGraphStorage, - global_config: dict, -): - already_weights = [] - already_source_ids = [] - already_description = [] - already_keywords = [] - - if await knwoledge_graph_inst.has_edge(src_id, tgt_id): - already_edge = await knwoledge_graph_inst.get_edge(src_id, tgt_id) - already_weights.append(already_edge["weight"]) - already_source_ids.extend( - split_string_by_multi_markers(already_edge["source_id"], [GRAPH_FIELD_SEP]) - ) - already_description.append(already_edge["description"]) - already_keywords.extend( - split_string_by_multi_markers(already_edge["keywords"], [GRAPH_FIELD_SEP]) - ) - - weight = sum([dp["weight"] for dp in edges_data] + already_weights) - description = GRAPH_FIELD_SEP.join( - sorted(set([dp["description"] for dp in edges_data] + already_description)) - ) - keywords = GRAPH_FIELD_SEP.join( - sorted(set([dp["keywords"] for dp in edges_data] + already_keywords)) - ) - source_id = GRAPH_FIELD_SEP.join( - set([dp["source_id"] for dp in edges_data] + already_source_ids) - ) - for need_insert_id in [src_id, tgt_id]: - if not (await knwoledge_graph_inst.has_node(need_insert_id)): - await knwoledge_graph_inst.upsert_node( - need_insert_id, - node_data={ - "source_id": source_id, - "description": description, - "entity_type": '"UNKNOWN"', - }, - ) - description = await _handle_entity_relation_summary( - (src_id, tgt_id), description, global_config - ) - await knwoledge_graph_inst.upsert_edge( - src_id, - tgt_id, - edge_data=dict( - weight=weight, - description=description, - keywords=keywords, - source_id=source_id, - ), - ) - - edge_data = dict( - src_id=src_id, - tgt_id=tgt_id, - description=description, - keywords=keywords, - ) - - return edge_data - -async def extract_entities( - chunks: dict[str, TextChunkSchema], - knwoledge_graph_inst: BaseGraphStorage, - entity_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - global_config: dict, -) -> Union[BaseGraphStorage, None]: - use_llm_func: callable = global_config["llm_model_func"] - entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] - - ordered_chunks = list(chunks.items()) - - entity_extract_prompt = PROMPTS["entity_extraction"] - context_base = dict( - tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], - record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], - completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], - entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]), - ) - continue_prompt = PROMPTS["entiti_continue_extraction"] - if_loop_prompt = PROMPTS["entiti_if_loop_extraction"] - - already_processed = 0 - already_entities = 0 - already_relations = 0 - - async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): - nonlocal 
already_processed, already_entities, already_relations - chunk_key = chunk_key_dp[0] - chunk_dp = chunk_key_dp[1] - content = chunk_dp["content"] - hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) - final_result = await use_llm_func(hint_prompt) - - history = pack_user_ass_to_openai_messages(hint_prompt, final_result) - for now_glean_index in range(entity_extract_max_gleaning): - glean_result = await use_llm_func(continue_prompt, history_messages=history) - - history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - final_result += glean_result - if now_glean_index == entity_extract_max_gleaning - 1: - break - - if_loop_result: str = await use_llm_func( - if_loop_prompt, history_messages=history - ) - if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() - if if_loop_result != "yes": - break - - records = split_string_by_multi_markers( - final_result, - [context_base["record_delimiter"], context_base["completion_delimiter"]], - ) - - maybe_nodes = defaultdict(list) - maybe_edges = defaultdict(list) - for record in records: - record = re.search(r"\((.*)\)", record) - if record is None: - continue - record = record.group(1) - record_attributes = split_string_by_multi_markers( - record, [context_base["tuple_delimiter"]] - ) - if_entities = await _handle_single_entity_extraction( - record_attributes, chunk_key - ) - if if_entities is not None: - maybe_nodes[if_entities["entity_name"]].append(if_entities) - continue - - if_relation = await _handle_single_relationship_extraction( - record_attributes, chunk_key - ) - if if_relation is not None: - maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( - if_relation - ) - already_processed += 1 - already_entities += len(maybe_nodes) - already_relations += len(maybe_edges) - now_ticks = PROMPTS["process_tickers"][ - already_processed % len(PROMPTS["process_tickers"]) - ] - print( - f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", - end="", - flush=True, - ) - return dict(maybe_nodes), dict(maybe_edges) - - # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings - results = await asyncio.gather( - *[_process_single_content(c) for c in ordered_chunks] - ) - print() # clear the progress bar - maybe_nodes = defaultdict(list) - maybe_edges = defaultdict(list) - for m_nodes, m_edges in results: - for k, v in m_nodes.items(): - maybe_nodes[k].extend(v) - for k, v in m_edges.items(): - maybe_edges[tuple(sorted(k))].extend(v) - all_entities_data = await asyncio.gather( - *[ - _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config) - for k, v in maybe_nodes.items() - ] - ) - all_relationships_data = await asyncio.gather( - *[ - _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config) - for k, v in maybe_edges.items() - ] - ) - if not len(all_entities_data): - logger.warning("Didn't extract any entities, maybe your LLM is not working") - return None - if not len(all_relationships_data): - logger.warning("Didn't extract any relationships, maybe your LLM is not working") - return None - - if entity_vdb is not None: - data_for_vdb = { - compute_mdhash_id(dp["entity_name"], prefix="ent-"): { - "content": dp["entity_name"] + dp["description"], - "entity_name": dp["entity_name"], - } - for dp in all_entities_data - } - await entity_vdb.upsert(data_for_vdb) - - if relationships_vdb is not None: - data_for_vdb = { - compute_mdhash_id(dp["src_id"] + 
dp["tgt_id"], prefix="rel-"): { - "src_id": dp["src_id"], - "tgt_id": dp["tgt_id"], - "content": dp["keywords"] + dp["src_id"] + dp["tgt_id"] + dp["description"], - } - for dp in all_relationships_data - } - await relationships_vdb.upsert(data_for_vdb) - - return knwoledge_graph_inst - -async def local_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, -) -> str: - use_model_func = global_config["llm_model_func"] - - kw_prompt_temp = PROMPTS["keywords_extraction"] - kw_prompt = kw_prompt_temp.format(query=query) - result = await use_model_func(kw_prompt) - - try: - keywords_data = json.loads(result) - keywords = keywords_data.get("low_level_keywords", []) - keywords = ', '.join(keywords) - except json.JSONDecodeError as e: - # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - - context = await _build_local_query_context( - keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) - if query_param.only_need_context: - return context - if context is None: - return PROMPTS["fail_response"] - sys_prompt_temp = PROMPTS["rag_response"] - sys_prompt = sys_prompt_temp.format( - context_data=context, response_type=query_param.response_type - ) - response = await use_model_func( - query, - system_prompt=sys_prompt, - ) - return response - -async def _build_local_query_context( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, -): - results = await entities_vdb.query(query, top_k=query_param.top_k) - if not len(results): - return None - node_datas = await asyncio.gather( - *[knowledge_graph_inst.get_node(r["entity_name"]) for r in results] - ) - if not all([n is not None for n in node_datas]): - logger.warning("Some nodes are missing, maybe the storage is damaged") - node_degrees = await asyncio.gather( - *[knowledge_graph_inst.node_degree(r["entity_name"]) for r in results] - ) - node_datas = [ - {**n, "entity_name": k["entity_name"], "rank": d} - for k, n, d in zip(results, node_datas, node_degrees) - if n is not None - ] - use_text_units = await _find_most_related_text_unit_from_entities( - node_datas, query_param, text_chunks_db, knowledge_graph_inst - ) - use_relations = await _find_most_related_edges_from_entities( - node_datas, query_param, knowledge_graph_inst - ) - logger.info( - f"Local query uses {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} text units" - ) - entites_section_list = [["id", "entity", "type", "description", "rank"]] - for i, n in enumerate(node_datas): - entites_section_list.append( - [ - i, - n["entity_name"], - n.get("entity_type", "UNKNOWN"), - n.get("description", "UNKNOWN"), - n["rank"], - ] - ) - entities_context = list_of_list_to_csv(entites_section_list) - - relations_section_list = [ - ["id", "source", "target", "description", "keywords", "weight", "rank"] - ] - for i, e in enumerate(use_relations): - relations_section_list.append( - [ - i, - e["src_tgt"][0], - e["src_tgt"][1], - e["description"], - e["keywords"], - e["weight"], - e["rank"], - ] - ) - relations_context = list_of_list_to_csv(relations_section_list) - - text_units_section_list = [["id", "content"]] - for i, t in enumerate(use_text_units): - text_units_section_list.append([i, t["content"]]) - 
text_units_context = list_of_list_to_csv(text_units_section_list) - return f""" ------Entities----- -```csv -{entities_context} -``` ------Relationships----- -```csv -{relations_context} -``` ------Sources----- -```csv -{text_units_context} -``` -""" - -async def _find_most_related_text_unit_from_entities( - node_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, -): - text_units = [ - split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) - for dp in node_datas - ] - edges = await asyncio.gather( - *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] - ) - all_one_hop_nodes = set() - for this_edges in edges: - if not this_edges: - continue - all_one_hop_nodes.update([e[1] for e in this_edges]) - all_one_hop_nodes = list(all_one_hop_nodes) - all_one_hop_nodes_data = await asyncio.gather( - *[knowledge_graph_inst.get_node(e) for e in all_one_hop_nodes] - ) - all_one_hop_text_units_lookup = { - k: set(split_string_by_multi_markers(v["source_id"], [GRAPH_FIELD_SEP])) - for k, v in zip(all_one_hop_nodes, all_one_hop_nodes_data) - if v is not None - } - all_text_units_lookup = {} - for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)): - for c_id in this_text_units: - if c_id in all_text_units_lookup: - continue - relation_counts = 0 - for e in this_edges: - if ( - e[1] in all_one_hop_text_units_lookup - and c_id in all_one_hop_text_units_lookup[e[1]] - ): - relation_counts += 1 - all_text_units_lookup[c_id] = { - "data": await text_chunks_db.get_by_id(c_id), - "order": index, - "relation_counts": relation_counts, - } - if any([v is None for v in all_text_units_lookup.values()]): - logger.warning("Text chunks are missing, maybe the storage is damaged") - all_text_units = [ - {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None - ] - all_text_units = sorted( - all_text_units, key=lambda x: (x["order"], -x["relation_counts"]) - ) - all_text_units = truncate_list_by_token_size( - all_text_units, - key=lambda x: x["data"]["content"], - max_token_size=query_param.max_token_for_text_unit, - ) - all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] - return all_text_units - -async def _find_most_related_edges_from_entities( - node_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, -): - all_related_edges = await asyncio.gather( - *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] - ) - all_edges = set() - for this_edges in all_related_edges: - all_edges.update([tuple(sorted(e)) for e in this_edges]) - all_edges = list(all_edges) - all_edges_pack = await asyncio.gather( - *[knowledge_graph_inst.get_edge(e[0], e[1]) for e in all_edges] - ) - all_edges_degree = await asyncio.gather( - *[knowledge_graph_inst.edge_degree(e[0], e[1]) for e in all_edges] - ) - all_edges_data = [ - {"src_tgt": k, "rank": d, **v} - for k, v, d in zip(all_edges, all_edges_pack, all_edges_degree) - if v is not None - ] - all_edges_data = sorted( - all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True - ) - all_edges_data = truncate_list_by_token_size( - all_edges_data, - key=lambda x: x["description"], - max_token_size=query_param.max_token_for_global_context, - ) - return all_edges_data - -async def global_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: 
BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, -) -> str: - use_model_func = global_config["llm_model_func"] - - kw_prompt_temp = PROMPTS["keywords_extraction"] - kw_prompt = kw_prompt_temp.format(query=query) - result = await use_model_func(kw_prompt) - - try: - keywords_data = json.loads(result) - keywords = keywords_data.get("high_level_keywords", []) - keywords = ', '.join(keywords) - except json.JSONDecodeError as e: - # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - - context = await _build_global_query_context( - keywords, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) - - if query_param.only_need_context: - return context - if context is None: - return PROMPTS["fail_response"] - - sys_prompt_temp = PROMPTS["rag_response"] - sys_prompt = sys_prompt_temp.format( - context_data=context, response_type=query_param.response_type - ) - response = await use_model_func( - query, - system_prompt=sys_prompt, - ) - return response - -async def _build_global_query_context( - keywords, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, -): - results = await relationships_vdb.query(keywords, top_k=query_param.top_k) - - if not len(results): - return None - - edge_datas = await asyncio.gather( - *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results] - ) - - if not all([n is not None for n in edge_datas]): - logger.warning("Some edges are missing, maybe the storage is damaged") - edge_degree = await asyncio.gather( - *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results] - ) - edge_datas = [ - {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v} - for k, v, d in zip(results, edge_datas, edge_degree) - if v is not None - ] - edge_datas = sorted( - edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True - ) - edge_datas = truncate_list_by_token_size( - edge_datas, - key=lambda x: x["description"], - max_token_size=query_param.max_token_for_global_context, - ) - - use_entities = await _find_most_related_entities_from_relationships( - edge_datas, query_param, knowledge_graph_inst - ) - use_text_units = await _find_related_text_unit_from_relationships( - edge_datas, query_param, text_chunks_db, knowledge_graph_inst - ) - logger.info( - f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} text units" - ) - relations_section_list = [ - ["id", "source", "target", "description", "keywords", "weight", "rank"] - ] - for i, e in enumerate(edge_datas): - relations_section_list.append( - [ - i, - e["src_id"], - e["tgt_id"], - e["description"], - e["keywords"], - e["weight"], - e["rank"], - ] - ) - relations_context = list_of_list_to_csv(relations_section_list) - - entites_section_list = [["id", "entity", "type", "description", "rank"]] - for i, n in enumerate(use_entities): - entites_section_list.append( - [ - i, - n["entity_name"], - n.get("entity_type", "UNKNOWN"), - n.get("description", "UNKNOWN"), - n["rank"], - ] - ) - entities_context = list_of_list_to_csv(entites_section_list) - - text_units_section_list = [["id", "content"]] - for i, t in enumerate(use_text_units): - text_units_section_list.append([i, t["content"]]) - text_units_context = list_of_list_to_csv(text_units_section_list) - - return f""" ------Entities----- -```csv 
-{entities_context} -``` ------Relationships----- -```csv -{relations_context} -``` ------Sources----- -```csv -{text_units_context} -``` -""" - -async def _find_most_related_entities_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, -): - entity_names = set() - for e in edge_datas: - entity_names.add(e["src_id"]) - entity_names.add(e["tgt_id"]) - - node_datas = await asyncio.gather( - *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] - ) - - node_degrees = await asyncio.gather( - *[knowledge_graph_inst.node_degree(entity_name) for entity_name in entity_names] - ) - node_datas = [ - {**n, "entity_name": k, "rank": d} - for k, n, d in zip(entity_names, node_datas, node_degrees) - ] - - node_datas = truncate_list_by_token_size( - node_datas, - key=lambda x: x["description"], - max_token_size=query_param.max_token_for_local_context, - ) - - return node_datas - -async def _find_related_text_unit_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, -): - - text_units = [ - split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) - for dp in edge_datas - ] - - all_text_units_lookup = {} - - for index, unit_list in enumerate(text_units): - for c_id in unit_list: - if c_id not in all_text_units_lookup: - all_text_units_lookup[c_id] = { - "data": await text_chunks_db.get_by_id(c_id), - "order": index, - } - - if any([v is None for v in all_text_units_lookup.values()]): - logger.warning("Text chunks are missing, maybe the storage is damaged") - all_text_units = [ - {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None - ] - all_text_units = sorted( - all_text_units, key=lambda x: x["order"] - ) - all_text_units = truncate_list_by_token_size( - all_text_units, - key=lambda x: x["data"]["content"], - max_token_size=query_param.max_token_for_text_unit, - ) - all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] - - return all_text_units - -async def hybird_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, -) -> str: - use_model_func = global_config["llm_model_func"] - - kw_prompt_temp = PROMPTS["keywords_extraction"] - kw_prompt = kw_prompt_temp.format(query=query) - result = await use_model_func(kw_prompt) - - try: - keywords_data = json.loads(result) - hl_keywords = keywords_data.get("high_level_keywords", []) - ll_keywords = keywords_data.get("low_level_keywords", []) - hl_keywords = ', '.join(hl_keywords) - ll_keywords = ', '.join(ll_keywords) - except json.JSONDecodeError as e: - # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - - low_level_context = await _build_local_query_context( - ll_keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) - - high_level_context = await _build_global_query_context( - hl_keywords, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) - - context = combine_contexts(high_level_context, low_level_context) - - if query_param.only_need_context: - return context - if context is None: - return PROMPTS["fail_response"] - - sys_prompt_temp = PROMPTS["rag_response"] - sys_prompt = sys_prompt_temp.format( - 
context_data=context, response_type=query_param.response_type - ) - response = await use_model_func( - query, - system_prompt=sys_prompt, - ) - return response - -def combine_contexts(high_level_context, low_level_context): - # Function to extract entities, relationships, and sources from context strings - def extract_sections(context): - entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - sources_match = re.search(r'-----Sources-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - - entities = entities_match.group(1) if entities_match else '' - relationships = relationships_match.group(1) if relationships_match else '' - sources = sources_match.group(1) if sources_match else '' - - return entities, relationships, sources - - # Extract sections from both contexts - hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) - ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) - - # Combine and deduplicate the entities - combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n'))) - combined_entities = '\n'.join(combined_entities_set) - - # Combine and deduplicate the relationships - combined_relationships_set = set(filter(None, hl_relationships.strip().split('\n') + ll_relationships.strip().split('\n'))) - combined_relationships = '\n'.join(combined_relationships_set) - - # Combine and deduplicate the sources - combined_sources_set = set(filter(None, hl_sources.strip().split('\n') + ll_sources.strip().split('\n'))) - combined_sources = '\n'.join(combined_sources_set) - - # Format the combined context - return f""" ------Entities----- -```csv -{combined_entities} ------Relationships----- -{combined_relationships} ------Sources----- -{combined_sources} -""" - -async def naive_query( - query, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, -): - use_model_func = global_config["llm_model_func"] - results = await chunks_vdb.query(query, top_k=query_param.top_k) - if not len(results): - return PROMPTS["fail_response"] - chunks_ids = [r["id"] for r in results] - chunks = await text_chunks_db.get_by_ids(chunks_ids) - - maybe_trun_chunks = truncate_list_by_token_size( - chunks, - key=lambda x: x["content"], - max_token_size=query_param.max_token_for_text_unit, - ) - logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks") - section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks]) - if query_param.only_need_context: - return section - sys_prompt_temp = PROMPTS["naive_rag_response"] - sys_prompt = sys_prompt_temp.format( - content_data=section, response_type=query_param.response_type - ) - response = await use_model_func( - query, - system_prompt=sys_prompt, - ) - return response - diff --git a/lightrag/prompt.py b/lightrag/prompt.py deleted file mode 100644 index 5d28e49c5..000000000 --- a/lightrag/prompt.py +++ /dev/null @@ -1,256 +0,0 @@ -GRAPH_FIELD_SEP = "" - -PROMPTS = {} - -PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" -PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" -PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" -PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] - -PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] - -PROMPTS[ - "entity_extraction" -] = """-Goal- -Given a 
text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. - --Steps- -1. Identify all entities. For each identified entity, extract the following information: -- entity_name: Name of the entity, capitalized -- entity_type: One of the following types: [{entity_types}] -- entity_description: Comprehensive description of the entity's attributes and activities -Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter} - -2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. -For each pair of related entities, extract the following information: -- source_entity: name of the source entity, as identified in step 1 -- target_entity: name of the target entity, as identified in step 1 -- relationship_description: explanation as to why you think the source entity and the target entity are related to each other -- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity -- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details -Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. -Format the content-level key words as ("content_keywords"{tuple_delimiter}) - -4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. - -5. When finished, output {completion_delimiter} - -###################### --Examples- -###################### -Example 1: - -Entity_types: [person, technology, mission, organization, location] -Text: -while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. - -Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” - -The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. - -It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths -################ -Output: -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} -("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} -("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} -("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} -############################# -Example 2: - -Entity_types: [person, technology, mission, organization, location] -Text: -They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. - -Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. - -Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly -############# -Output: -("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} -("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} -("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} -("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} -############################# -Example 3: - -Entity_types: [person, role, technology, organization, event, location, concept] -Text: -their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. - -"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." - -Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." - -Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
- -The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation -############# -Output: -("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} -("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} -("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} -("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} -("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} -("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} -("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} -############################# --Real Data- -###################### -Entity_types: {entity_types} -Text: {input_text} -###################### -Output: -""" - -PROMPTS[ - "summarize_entity_descriptions" -] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. -Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. -Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. -Make sure it is written in third person, and include the entity names so we the have full context. 
- -####### --Data- -Entities: {entity_name} -Description List: {description_list} -####### -Output: -""" - -PROMPTS[ - "entiti_continue_extraction" -] = """MANY entities were missed in the last extraction. Add them below using the same format: -""" - -PROMPTS[ - "entiti_if_loop_extraction" -] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. -""" - -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." - -PROMPTS[ - "rag_response" -] = """---Role--- - -You are a helpful assistant responding to questions about data in the tables provided. - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. - ----Target response length and format--- - -{response_type} - - ----Data tables--- - -{context_data} - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. - -If you don't know the answer, just say so. Do not make anything up. - -Do not include information where the supporting evidence for it is not provided. - - ----Target response length and format--- - -{response_type} - -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. -""" - -PROMPTS["keywords_extraction"] = """---Role--- - -You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. - ----Goal--- - -Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. - ----Instructions--- - -- Output the keywords in JSON format. -- The JSON should have two keys: - - "high_level_keywords" for overarching concepts or themes. - - "low_level_keywords" for specific entities or details. - -###################### --Examples- -###################### -Example 1: - -Query: "How does international trade influence global economic stability?" -################ -Output: -{{ - "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], - "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] -}} -############################# -Example 2: - -Query: "What are the environmental consequences of deforestation on biodiversity?" -################ -Output: -{{ - "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], - "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] -}} -############################# -Example 3: - -Query: "What is the role of education in reducing poverty?" 
-################ -Output: -{{ - "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], - "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] -}} -############################# --Real Data- -###################### -Query: {query} -###################### -Output: - -""" - -PROMPTS[ - "naive_rag_response" -] = """You're a helpful assistant -Below are the knowledge you know: -{content_data} ---- -If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. ----Target response length and format--- -{response_type} -""" diff --git a/lightrag/storage.py b/lightrag/storage.py deleted file mode 100644 index 2f2bb7d8f..000000000 --- a/lightrag/storage.py +++ /dev/null @@ -1,246 +0,0 @@ -import asyncio -import html -import json -import os -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Union, cast -import pickle -import hnswlib -import networkx as nx -import numpy as np -from nano_vectordb import NanoVectorDB -import xxhash - -from .utils import load_json, logger, write_json -from .base import ( - BaseGraphStorage, - BaseKVStorage, - BaseVectorStorage, -) - -@dataclass -class JsonKVStorage(BaseKVStorage): - def __post_init__(self): - working_dir = self.global_config["working_dir"] - self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") - self._data = load_json(self._file_name) or {} - logger.info(f"Load KV {self.namespace} with {len(self._data)} data") - - async def all_keys(self) -> list[str]: - return list(self._data.keys()) - - async def index_done_callback(self): - write_json(self._data, self._file_name) - - async def get_by_id(self, id): - return self._data.get(id, None) - - async def get_by_ids(self, ids, fields=None): - if fields is None: - return [self._data.get(id, None) for id in ids] - return [ - ( - {k: v for k, v in self._data[id].items() if k in fields} - if self._data.get(id, None) - else None - ) - for id in ids - ] - - async def filter_keys(self, data: list[str]) -> set[str]: - return set([s for s in data if s not in self._data]) - - async def upsert(self, data: dict[str, dict]): - left_data = {k: v for k, v in data.items() if k not in self._data} - self._data.update(left_data) - return left_data - - async def drop(self): - self._data = {} - -@dataclass -class NanoVectorDBStorage(BaseVectorStorage): - cosine_better_than_threshold: float = 0.2 - - def __post_init__(self): - - self._client_file_name = os.path.join( - self.global_config["working_dir"], f"vdb_{self.namespace}.json" - ) - self._max_batch_size = self.global_config["embedding_batch_num"] - self._client = NanoVectorDB( - self.embedding_func.embedding_dim, storage_file=self._client_file_name - ) - self.cosine_better_than_threshold = self.global_config.get( - "cosine_better_than_threshold", self.cosine_better_than_threshold - ) - - async def upsert(self, data: dict[str, dict]): - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert 
an empty data to vector DB") - return [] - list_data = [ - { - "__id__": k, - **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, - } - for k, v in data.items() - ] - contents = [v["content"] for v in data.values()] - batches = [ - contents[i : i + self._max_batch_size] - for i in range(0, len(contents), self._max_batch_size) - ] - embeddings_list = await asyncio.gather( - *[self.embedding_func(batch) for batch in batches] - ) - embeddings = np.concatenate(embeddings_list) - for i, d in enumerate(list_data): - d["__vector__"] = embeddings[i] - results = self._client.upsert(datas=list_data) - return results - - async def query(self, query: str, top_k=5): - embedding = await self.embedding_func([query]) - embedding = embedding[0] - results = self._client.query( - query=embedding, - top_k=top_k, - better_than_threshold=self.cosine_better_than_threshold, - ) - results = [ - {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results - ] - return results - - async def index_done_callback(self): - self._client.save() - -@dataclass -class NetworkXStorage(BaseGraphStorage): - @staticmethod - def load_nx_graph(file_name) -> nx.Graph: - if os.path.exists(file_name): - return nx.read_graphml(file_name) - return None - - @staticmethod - def write_nx_graph(graph: nx.Graph, file_name): - logger.info( - f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" - ) - nx.write_graphml(graph, file_name) - - @staticmethod - def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: - """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py - Return the largest connected component of the graph, with nodes and edges sorted in a stable way. - """ - from graspologic.utils import largest_connected_component - - graph = graph.copy() - graph = cast(nx.Graph, largest_connected_component(graph)) - node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore - graph = nx.relabel_nodes(graph, node_mapping) - return NetworkXStorage._stabilize_graph(graph) - - @staticmethod - def _stabilize_graph(graph: nx.Graph) -> nx.Graph: - """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py - Ensure an undirected graph with the same relationships will always be read the same way. 
- """ - fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() - - sorted_nodes = graph.nodes(data=True) - sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) - - fixed_graph.add_nodes_from(sorted_nodes) - edges = list(graph.edges(data=True)) - - if not graph.is_directed(): - - def _sort_source_target(edge): - source, target, edge_data = edge - if source > target: - temp = source - source = target - target = temp - return source, target, edge_data - - edges = [_sort_source_target(edge) for edge in edges] - - def _get_edge_key(source: Any, target: Any) -> str: - return f"{source} -> {target}" - - edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) - - fixed_graph.add_edges_from(edges) - return fixed_graph - - def __post_init__(self): - self._graphml_xml_file = os.path.join( - self.global_config["working_dir"], f"graph_{self.namespace}.graphml" - ) - preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) - if preloaded_graph is not None: - logger.info( - f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" - ) - self._graph = preloaded_graph or nx.Graph() - self._node_embed_algorithms = { - "node2vec": self._node2vec_embed, - } - - async def index_done_callback(self): - NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) - - async def has_node(self, node_id: str) -> bool: - return self._graph.has_node(node_id) - - async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - return self._graph.has_edge(source_node_id, target_node_id) - - async def get_node(self, node_id: str) -> Union[dict, None]: - return self._graph.nodes.get(node_id) - - async def node_degree(self, node_id: str) -> int: - return self._graph.degree(node_id) - - async def edge_degree(self, src_id: str, tgt_id: str) -> int: - return self._graph.degree(src_id) + self._graph.degree(tgt_id) - - async def get_edge( - self, source_node_id: str, target_node_id: str - ) -> Union[dict, None]: - return self._graph.edges.get((source_node_id, target_node_id)) - - async def get_node_edges(self, source_node_id: str): - if self._graph.has_node(source_node_id): - return list(self._graph.edges(source_node_id)) - return None - - async def upsert_node(self, node_id: str, node_data: dict[str, str]): - self._graph.add_node(node_id, **node_data) - - async def upsert_edge( - self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] - ): - self._graph.add_edge(source_node_id, target_node_id, **edge_data) - - async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - if algorithm not in self._node_embed_algorithms: - raise ValueError(f"Node embedding algorithm {algorithm} not supported") - return await self._node_embed_algorithms[algorithm]() - - async def _node2vec_embed(self): - from graspologic import embed - - embeddings, nodes = embed.node2vec_embed( - self._graph, - **self.global_config["node2vec_params"], - ) - - nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] - return embeddings, nodes_ids diff --git a/lightrag/utils.py b/lightrag/utils.py deleted file mode 100644 index c75b4270c..000000000 --- a/lightrag/utils.py +++ /dev/null @@ -1,165 +0,0 @@ -import asyncio -import html -import json -import logging -import os -import re -from dataclasses import dataclass -from functools import wraps -from hashlib import md5 -from typing import Any, Union - -import numpy as np -import tiktoken - -ENCODER = None - -logger = 
logging.getLogger("lightrag") - -def set_logger(log_file: str): - logger.setLevel(logging.DEBUG) - - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - file_handler.setFormatter(formatter) - - if not logger.handlers: - logger.addHandler(file_handler) - -@dataclass -class EmbeddingFunc: - embedding_dim: int - max_token_size: int - func: callable - - async def __call__(self, *args, **kwargs) -> np.ndarray: - return await self.func(*args, **kwargs) - -def locate_json_string_body_from_string(content: str) -> Union[str, None]: - """Locate the JSON string body from a string""" - maybe_json_str = re.search(r"{.*}", content, re.DOTALL) - if maybe_json_str is not None: - return maybe_json_str.group(0) - else: - return None - -def convert_response_to_json(response: str) -> dict: - json_str = locate_json_string_body_from_string(response) - assert json_str is not None, f"Unable to parse JSON from response: {response}" - try: - data = json.loads(json_str) - return data - except json.JSONDecodeError as e: - logger.error(f"Failed to parse JSON: {json_str}") - raise e from None - -def compute_args_hash(*args): - return md5(str(args).encode()).hexdigest() - -def compute_mdhash_id(content, prefix: str = ""): - return prefix + md5(content.encode()).hexdigest() - -def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): - """Add restriction of maximum async calling times for a async func""" - - def final_decro(func): - """Not using async.Semaphore to aovid use nest-asyncio""" - __current_size = 0 - - @wraps(func) - async def wait_func(*args, **kwargs): - nonlocal __current_size - while __current_size >= max_size: - await asyncio.sleep(waitting_time) - __current_size += 1 - result = await func(*args, **kwargs) - __current_size -= 1 - return result - - return wait_func - - return final_decro - -def wrap_embedding_func_with_attrs(**kwargs): - """Wrap a function with attributes""" - - def final_decro(func) -> EmbeddingFunc: - new_func = EmbeddingFunc(**kwargs, func=func) - return new_func - - return final_decro - -def load_json(file_name): - if not os.path.exists(file_name): - return None - with open(file_name) as f: - return json.load(f) - -def write_json(json_obj, file_name): - with open(file_name, "w") as f: - json.dump(json_obj, f, indent=2, ensure_ascii=False) - -def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): - global ENCODER - if ENCODER is None: - ENCODER = tiktoken.encoding_for_model(model_name) - tokens = ENCODER.encode(content) - return tokens - - -def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): - global ENCODER - if ENCODER is None: - ENCODER = tiktoken.encoding_for_model(model_name) - content = ENCODER.decode(tokens) - return content - -def pack_user_ass_to_openai_messages(*args: str): - roles = ["user", "assistant"] - return [ - {"role": roles[i % 2], "content": content} for i, content in enumerate(args) - ] - -def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: - """Split a string by multiple markers""" - if not markers: - return [content] - results = re.split("|".join(re.escape(marker) for marker in markers), content) - return [r.strip() for r in results if r.strip()] - -# Refer the utils functions of the official GraphRAG implementation: -# https://github.com/microsoft/graphrag -def clean_str(input: Any) -> str: - """Clean an input string by removing HTML escapes, control 
characters, and other unwanted characters.""" - # If we get non-string input, just give it back - if not isinstance(input, str): - return input - - result = html.unescape(input.strip()) - # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) - -def is_float_regex(value): - return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) - -def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): - """Truncate a list of data by token size""" - if max_token_size <= 0: - return [] - tokens = 0 - for i, data in enumerate(list_data): - tokens += len(encode_string_by_tiktoken(key(data))) - if tokens > max_token_size: - return list_data[:i] - return list_data - -def list_of_list_to_csv(data: list[list]): - return "\n".join( - [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] - ) - -def save_data_to_file(data, file_name): - with open(file_name, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file From 30d54da6230d3b9ff51561cb2f374a5c29d10e28 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 15:01:40 +0800 Subject: [PATCH 09/67] update --- README.md | 198 +++++++++ lightrag/__init__.py | 5 + lightrag/base.py | 116 ++++++ lightrag/lightrag.py | 300 ++++++++++++++ lightrag/llm.py | 88 ++++ lightrag/operate.py | 944 +++++++++++++++++++++++++++++++++++++++++++ lightrag/prompt.py | 256 ++++++++++++ lightrag/storage.py | 246 +++++++++++ lightrag/utils.py | 165 ++++++++ 9 files changed, 2318 insertions(+) create mode 100644 README.md create mode 100644 lightrag/__init__.py create mode 100644 lightrag/base.py create mode 100644 lightrag/lightrag.py create mode 100644 lightrag/llm.py create mode 100644 lightrag/operate.py create mode 100644 lightrag/prompt.py create mode 100644 lightrag/storage.py create mode 100644 lightrag/utils.py diff --git a/README.md b/README.md new file mode 100644 index 000000000..42de1c1cb --- /dev/null +++ b/README.md @@ -0,0 +1,198 @@ +# LightRAG: Simple and Fast Retrieval-Augmented Generation +![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) + + + + + + +This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). +![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) +## Install + +* Install from source + +```bash +cd LightRAG +pip install -e . 
+``` +* Install from PyPI +```bash +pip install lightrag-hku +``` + +## Quick Start + +* Set OpenAI API key in environment: `export OPENAI_API_KEY="sk-...".` +* Download the demo text "A Christmas Carol by Charles Dickens" +```bash +curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt +``` +Use the below python snippet: + +```python +from lightrag import LightRAG, QueryParam + +rag = LightRAG(working_dir="./dickens") + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybird search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybird"))) +``` +Batch Insert +```python +rag.insert(["TEXT1", "TEXT2",...]) +``` +Incremental Insert + +```python +rag = LightRAG(working_dir="./dickens") + +with open("./newText.txt") as f: + rag.insert(f.read()) +``` +## Evaluation +### Dataset +The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). + +### Generate Query +LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`. +```python +Given the following description of a dataset: + +{description} + +Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset. + +Output the results in the following structure: +- User 1: [user description] + - Task 1: [task description] + - Question 1: + - Question 2: + - Question 3: + - Question 4: + - Question 5: + - Task 2: [task description] + ... + - Task 5: [task description] +- User 2: [user description] + ... +- User 5: [user description] + ... +``` + + ### Batch Eval +To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`. +```python +---Role--- +You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. +---Goal--- +You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. + +- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? +- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? +- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic? + +For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories. + +Here is the question: +{query} + +Here are the two answers: + +**Answer 1:** +{answer1} + +**Answer 2:** +{answer2} + +Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion. 
+ +Output your evaluation in the following JSON format: + +{{ + "Comprehensiveness": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation here]" + }}, + "Empowerment": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation here]" + }}, + "Overall Winner": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]" + }} +}} +``` +### Overall Performance Table +### Overall Performance Table +| | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | +|----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| +| | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | +| **Comprehensiveness** | 32.69% | **67.31%** | 35.44% | **64.56%** | 19.05% | **80.95%** | 36.36% | **63.64%** | +| **Diversity** | 24.09% | **75.91%** | 35.24% | **64.76%** | 10.98% | **89.02%** | 30.76% | **69.24%** | +| **Empowerment** | 31.35% | **68.65%** | 35.48% | **64.52%** | 17.59% | **82.41%** | 40.95% | **59.05%** | +| **Overall** | 33.30% | **66.70%** | 34.76% | **65.24%** | 17.46% | **82.54%** | 37.59% | **62.40%** | +| | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | +| **Comprehensiveness** | 32.05% | **67.95%** | 39.30% | **60.70%** | 18.57% | **81.43%** | 38.89% | **61.11%** | +| **Diversity** | 29.44% | **70.56%** | 38.71% | **61.29%** | 15.14% | **84.86%** | 28.50% | **71.50%** | +| **Empowerment** | 32.51% | **67.49%** | 37.52% | **62.48%** | 17.80% | **82.20%** | 43.96% | **56.04%** | +| **Overall** | 33.29% | **66.71%** | 39.03% | **60.97%** | 17.80% | **82.20%** | 39.61% | **60.39%** | +| | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | +| **Comprehensiveness** | 24.39% | **75.61%** | 36.49% | **63.51%** | 27.68% | **72.32%** | 42.17% | **57.83%** | +| **Diversity** | 24.96% | **75.34%** | 37.41% | **62.59%** | 18.79% | **81.21%** | 30.88% | **69.12%** | +| **Empowerment** | 24.89% | **75.11%** | 34.99% | **65.01%** | 26.99% | **73.01%** | **45.61%** | **54.39%** | +| **Overall** | 23.17% | **76.83%** | 35.67% | **64.33%** | 27.68% | **72.32%** | 42.72% | **57.28%** | +| | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | +| **Comprehensiveness** | 45.56% | **54.44%** | 45.98% | **54.02%** | 47.13% | **52.87%** | **51.86%** | 48.14% | +| **Diversity** | 19.65% | **80.35%** | 39.64% | **60.36%** | 25.55% | **74.45%** | 35.87% | **64.13%** | +| **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | +| **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | + +## Code Structure + +```python +. 
+├── examples +│ ├── batch_eval.py +│ ├── generate_query.py +│ ├── insert.py +│ └── query.py +├── lightrag +│ ├── __init__.py +│ ├── base.py +│ ├── lightrag.py +│ ├── llm.py +│ ├── operate.py +│ ├── prompt.py +│ ├── storage.py +│ └── utils.jpeg +├── LICENSE +├── README.md +├── requirements.txt +└── setup.py +``` +## Citation + +``` +@article{guo2024lightrag, +title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, +author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, +year={2024}, +eprint={2410.05779}, +archivePrefix={arXiv}, +primaryClass={cs.IR} +} +``` diff --git a/lightrag/__init__.py b/lightrag/__init__.py new file mode 100644 index 000000000..dc497cd44 --- /dev/null +++ b/lightrag/__init__.py @@ -0,0 +1,5 @@ +from .lightrag import LightRAG, QueryParam + +__version__ = "0.0.2" +__author__ = "Zirui Guo" +__url__ = "https://github.com/HKUDS/GraphEdit" diff --git a/lightrag/base.py b/lightrag/base.py new file mode 100644 index 000000000..9c0422feb --- /dev/null +++ b/lightrag/base.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass, field +from typing import TypedDict, Union, Literal, Generic, TypeVar + +import numpy as np + +from .utils import EmbeddingFunc + +TextChunkSchema = TypedDict( + "TextChunkSchema", + {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, +) + +T = TypeVar("T") + +@dataclass +class QueryParam: + mode: Literal["local", "global", "hybird", "naive"] = "global" + only_need_context: bool = False + response_type: str = "Multiple Paragraphs" + top_k: int = 60 + max_token_for_text_unit: int = 4000 + max_token_for_global_context: int = 4000 + max_token_for_local_context: int = 4000 + + +@dataclass +class StorageNameSpace: + namespace: str + global_config: dict + + async def index_done_callback(self): + """commit the storage operations after indexing""" + pass + + async def query_done_callback(self): + """commit the storage operations after querying""" + pass + +@dataclass +class BaseVectorStorage(StorageNameSpace): + embedding_func: EmbeddingFunc + meta_fields: set = field(default_factory=set) + + async def query(self, query: str, top_k: int) -> list[dict]: + raise NotImplementedError + + async def upsert(self, data: dict[str, dict]): + """Use 'content' field from value for embedding, use key as id. 
+ If embedding_func is None, use 'embedding' field from value + """ + raise NotImplementedError + +@dataclass +class BaseKVStorage(Generic[T], StorageNameSpace): + async def all_keys(self) -> list[str]: + raise NotImplementedError + + async def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + async def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + async def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + async def upsert(self, data: dict[str, T]): + raise NotImplementedError + + async def drop(self): + raise NotImplementedError + + +@dataclass +class BaseGraphStorage(StorageNameSpace): + async def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + async def node_degree(self, node_id: str) -> int: + raise NotImplementedError + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + async def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + raise NotImplementedError + + async def get_node_edges( + self, source_node_id: str + ) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + async def clustering(self, algorithm: str): + raise NotImplementedError + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py new file mode 100644 index 000000000..836fda9ec --- /dev/null +++ b/lightrag/lightrag.py @@ -0,0 +1,300 @@ +import asyncio +import os +from dataclasses import asdict, dataclass, field +from datetime import datetime +from functools import partial +from typing import Type, cast + +from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding +from .operate import ( + chunking_by_token_size, + extract_entities, + local_query, + global_query, + hybird_query, + naive_query, +) + +from .storage import ( + JsonKVStorage, + NanoVectorDBStorage, + NetworkXStorage, +) +from .utils import ( + EmbeddingFunc, + compute_mdhash_id, + limit_async_func_call, + convert_response_to_json, + logger, + set_logger, +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + StorageNameSpace, + QueryParam, +) + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + # If there is already an event loop, use it. + loop = asyncio.get_event_loop() + except RuntimeError: + # If in a sub-thread, create a new event loop. 
+ logger.info("Creating a new event loop in a sub-thread.") + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + +@dataclass +class LightRAG: + working_dir: str = field( + default_factory=lambda: f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" + ) + + # text chunking + chunk_token_size: int = 1200 + chunk_overlap_token_size: int = 100 + tiktoken_model_name: str = "gpt-4o-mini" + + # entity extraction + entity_extract_max_gleaning: int = 1 + entity_summary_to_max_tokens: int = 500 + + # node embedding + node_embedding_algorithm: str = "node2vec" + node2vec_params: dict = field( + default_factory=lambda: { + "dimensions": 1536, + "num_walks": 10, + "walk_length": 40, + "num_walks": 10, + "window_size": 2, + "iterations": 3, + "random_seed": 3, + } + ) + + # text embedding + embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) + embedding_batch_num: int = 32 + embedding_func_max_async: int = 16 + + # LLM + llm_model_func: callable = gpt_4o_mini_complete + llm_model_max_token_size: int = 32768 + llm_model_max_async: int = 16 + + # storage + key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage + vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage + vector_db_storage_cls_kwargs: dict = field(default_factory=dict) + graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage + enable_llm_cache: bool = True + + # extension + addon_params: dict = field(default_factory=dict) + convert_response_to_json_func: callable = convert_response_to_json + + def __post_init__(self): + log_file = os.path.join(self.working_dir, "lightrag.log") + set_logger(log_file) + logger.info(f"Logger initialized for working directory: {self.working_dir}") + + _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) + logger.debug(f"LightRAG init with param:\n {_print_config}\n") + + if not os.path.exists(self.working_dir): + logger.info(f"Creating working directory {self.working_dir}") + os.makedirs(self.working_dir) + + self.full_docs = self.key_string_value_json_storage_cls( + namespace="full_docs", global_config=asdict(self) + ) + + self.text_chunks = self.key_string_value_json_storage_cls( + namespace="text_chunks", global_config=asdict(self) + ) + + self.llm_response_cache = ( + self.key_string_value_json_storage_cls( + namespace="llm_response_cache", global_config=asdict(self) + ) + if self.enable_llm_cache + else None + ) + self.chunk_entity_relation_graph = self.graph_storage_cls( + namespace="chunk_entity_relation", global_config=asdict(self) + ) + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( + self.embedding_func + ) + self.entities_vdb = ( + self.vector_db_storage_cls( + namespace="entities", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"entity_name"} + ) + ) + self.relationships_vdb = ( + self.vector_db_storage_cls( + namespace="relationships", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"src_id", "tgt_id"} + ) + ) + self.chunks_vdb = ( + self.vector_db_storage_cls( + namespace="chunks", + global_config=asdict(self), + embedding_func=self.embedding_func, + ) + ) + + self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( + partial(self.llm_model_func, hashing_kv=self.llm_response_cache) + ) + + def insert(self, string_or_strings): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert(string_or_strings)) + + async def ainsert(self, 
string_or_strings): + try: + if isinstance(string_or_strings, str): + string_or_strings = [string_or_strings] + + new_docs = { + compute_mdhash_id(c.strip(), prefix="doc-"): {"content": c.strip()} + for c in string_or_strings + } + _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not len(new_docs): + logger.warning(f"All docs are already in the storage") + return + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + inserting_chunks = {} + for doc_key, doc in new_docs.items(): + chunks = { + compute_mdhash_id(dp["content"], prefix="chunk-"): { + **dp, + "full_doc_id": doc_key, + } + for dp in chunking_by_token_size( + doc["content"], + overlap_token_size=self.chunk_overlap_token_size, + max_token_size=self.chunk_token_size, + tiktoken_model=self.tiktoken_model_name, + ) + } + inserting_chunks.update(chunks) + _add_chunk_keys = await self.text_chunks.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + if not len(inserting_chunks): + logger.warning(f"All chunks are already in the storage") + return + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + await self.chunks_vdb.upsert(inserting_chunks) + + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knwoledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + self.chunk_entity_relation_graph = maybe_new_kg + + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + finally: + await self._insert_done() + + async def _insert_done(self): + tasks = [] + for storage_inst in [ + self.full_docs, + self.text_chunks, + self.llm_response_cache, + self.entities_vdb, + self.relationships_vdb, + self.chunks_vdb, + self.chunk_entity_relation_graph, + ]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) + await asyncio.gather(*tasks) + + def query(self, query: str, param: QueryParam = QueryParam()): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.aquery(query, param)) + + async def aquery(self, query: str, param: QueryParam = QueryParam()): + if param.mode == "local": + response = await local_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "global": + response = await global_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "hybird": + response = await hybird_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "naive": + response = await naive_query( + query, + self.chunks_vdb, + self.text_chunks, + param, + asdict(self), + ) + else: + raise ValueError(f"Unknown mode {param.mode}") + await self._query_done() + return response + + + async def _query_done(self): + tasks = [] + for storage_inst in [self.llm_response_cache]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, 
storage_inst).index_done_callback()) + await asyncio.gather(*tasks) + + diff --git a/lightrag/llm.py b/lightrag/llm.py new file mode 100644 index 000000000..ee700a104 --- /dev/null +++ b/lightrag/llm.py @@ -0,0 +1,88 @@ +import os +import numpy as np +from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) + +from .base import BaseKVStorage +from .utils import compute_args_hash, wrap_embedding_func_with_attrs + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), +) +async def openai_complete_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI() + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + return response.choices[0].message.content + +async def gpt_4o_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def gpt_4o_mini_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), +) +async def openai_embedding(texts: list[str]) -> np.ndarray: + openai_async_client = AsyncOpenAI() + response = await openai_async_client.embeddings.create( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + return np.array([dp.embedding for dp in response.data]) + +if __name__ == "__main__": + import asyncio + + async def main(): + result = await gpt_4o_mini_complete('How are you?') + print(result) + + asyncio.run(main()) diff --git a/lightrag/operate.py b/lightrag/operate.py new file mode 100644 index 000000000..2d3271da8 --- /dev/null +++ b/lightrag/operate.py @@ -0,0 +1,944 @@ +import asyncio +import json +import re +from typing import Union +from collections import Counter, defaultdict + +from .utils import ( + logger, + clean_str, + compute_mdhash_id, + decode_tokens_by_tiktoken, + encode_string_by_tiktoken, + is_float_regex, + list_of_list_to_csv, + pack_user_ass_to_openai_messages, + split_string_by_multi_markers, + truncate_list_by_token_size, +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + TextChunkSchema, + QueryParam, +) +from .prompt import GRAPH_FIELD_SEP, PROMPTS + +def 
chunking_by_token_size( + content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" +): + tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) + results = [] + for index, start in enumerate( + range(0, len(tokens), max_token_size - overlap_token_size) + ): + chunk_content = decode_tokens_by_tiktoken( + tokens[start : start + max_token_size], model_name=tiktoken_model + ) + results.append( + { + "tokens": min(max_token_size, len(tokens) - start), + "content": chunk_content.strip(), + "chunk_order_index": index, + } + ) + return results + +async def _handle_entity_relation_summary( + entity_or_relation_name: str, + description: str, + global_config: dict, +) -> str: + use_llm_func: callable = global_config["llm_model_func"] + llm_max_tokens = global_config["llm_model_max_token_size"] + tiktoken_model_name = global_config["tiktoken_model_name"] + summary_max_tokens = global_config["entity_summary_to_max_tokens"] + + tokens = encode_string_by_tiktoken(description, model_name=tiktoken_model_name) + if len(tokens) < summary_max_tokens: # No need for summary + return description + prompt_template = PROMPTS["summarize_entity_descriptions"] + use_description = decode_tokens_by_tiktoken( + tokens[:llm_max_tokens], model_name=tiktoken_model_name + ) + context_base = dict( + entity_name=entity_or_relation_name, + description_list=use_description.split(GRAPH_FIELD_SEP), + ) + use_prompt = prompt_template.format(**context_base) + logger.debug(f"Trigger summary: {entity_or_relation_name}") + summary = await use_llm_func(use_prompt, max_tokens=summary_max_tokens) + return summary + + +async def _handle_single_entity_extraction( + record_attributes: list[str], + chunk_key: str, +): + if record_attributes[0] != '"entity"' or len(record_attributes) < 4: + return None + # add this record as a node in the G + entity_name = clean_str(record_attributes[1].upper()) + if not entity_name.strip(): + return None + entity_type = clean_str(record_attributes[2].upper()) + entity_description = clean_str(record_attributes[3]) + entity_source_id = chunk_key + return dict( + entity_name=entity_name, + entity_type=entity_type, + description=entity_description, + source_id=entity_source_id, + ) + + +async def _handle_single_relationship_extraction( + record_attributes: list[str], + chunk_key: str, +): + if record_attributes[0] != '"relationship"' or len(record_attributes) < 5: + return None + # add this record as edge + source = clean_str(record_attributes[1].upper()) + target = clean_str(record_attributes[2].upper()) + edge_description = clean_str(record_attributes[3]) + + edge_keywords = clean_str(record_attributes[4]) + edge_source_id = chunk_key + weight = ( + float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0 + ) + return dict( + src_id=source, + tgt_id=target, + weight=weight, + description=edge_description, + keywords=edge_keywords, + source_id=edge_source_id, + ) + + +async def _merge_nodes_then_upsert( + entity_name: str, + nodes_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, +): + already_entitiy_types = [] + already_source_ids = [] + already_description = [] + + already_node = await knwoledge_graph_inst.get_node(entity_name) + if already_node is not None: + already_entitiy_types.append(already_node["entity_type"]) + already_source_ids.extend( + split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_node["description"]) + + entity_type = sorted( + 
Counter( + [dp["entity_type"] for dp in nodes_data] + already_entitiy_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in nodes_data] + already_description)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in nodes_data] + already_source_ids) + ) + description = await _handle_entity_relation_summary( + entity_name, description, global_config + ) + node_data = dict( + entity_type=entity_type, + description=description, + source_id=source_id, + ) + await knwoledge_graph_inst.upsert_node( + entity_name, + node_data=node_data, + ) + node_data["entity_name"] = entity_name + return node_data + + +async def _merge_edges_then_upsert( + src_id: str, + tgt_id: str, + edges_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, +): + already_weights = [] + already_source_ids = [] + already_description = [] + already_keywords = [] + + if await knwoledge_graph_inst.has_edge(src_id, tgt_id): + already_edge = await knwoledge_graph_inst.get_edge(src_id, tgt_id) + already_weights.append(already_edge["weight"]) + already_source_ids.extend( + split_string_by_multi_markers(already_edge["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_edge["description"]) + already_keywords.extend( + split_string_by_multi_markers(already_edge["keywords"], [GRAPH_FIELD_SEP]) + ) + + weight = sum([dp["weight"] for dp in edges_data] + already_weights) + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in edges_data] + already_description)) + ) + keywords = GRAPH_FIELD_SEP.join( + sorted(set([dp["keywords"] for dp in edges_data] + already_keywords)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in edges_data] + already_source_ids) + ) + for need_insert_id in [src_id, tgt_id]: + if not (await knwoledge_graph_inst.has_node(need_insert_id)): + await knwoledge_graph_inst.upsert_node( + need_insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": '"UNKNOWN"', + }, + ) + description = await _handle_entity_relation_summary( + (src_id, tgt_id), description, global_config + ) + await knwoledge_graph_inst.upsert_edge( + src_id, + tgt_id, + edge_data=dict( + weight=weight, + description=description, + keywords=keywords, + source_id=source_id, + ), + ) + + edge_data = dict( + src_id=src_id, + tgt_id=tgt_id, + description=description, + keywords=keywords, + ) + + return edge_data + +async def extract_entities( + chunks: dict[str, TextChunkSchema], + knwoledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + global_config: dict, +) -> Union[BaseGraphStorage, None]: + use_llm_func: callable = global_config["llm_model_func"] + entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] + + ordered_chunks = list(chunks.items()) + + entity_extract_prompt = PROMPTS["entity_extraction"] + context_base = dict( + tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], + record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], + completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], + entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]), + ) + continue_prompt = PROMPTS["entiti_continue_extraction"] + if_loop_prompt = PROMPTS["entiti_if_loop_extraction"] + + already_processed = 0 + already_entities = 0 + already_relations = 0 + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + nonlocal 
already_processed, already_entities, already_relations + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) + final_result = await use_llm_func(hint_prompt) + + history = pack_user_ass_to_openai_messages(hint_prompt, final_result) + for now_glean_index in range(entity_extract_max_gleaning): + glean_result = await use_llm_func(continue_prompt, history_messages=history) + + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) + final_result += glean_result + if now_glean_index == entity_extract_max_gleaning - 1: + break + + if_loop_result: str = await use_llm_func( + if_loop_prompt, history_messages=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + records = split_string_by_multi_markers( + final_result, + [context_base["record_delimiter"], context_base["completion_delimiter"]], + ) + + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + for record in records: + record = re.search(r"\((.*)\)", record) + if record is None: + continue + record = record.group(1) + record_attributes = split_string_by_multi_markers( + record, [context_base["tuple_delimiter"]] + ) + if_entities = await _handle_single_entity_extraction( + record_attributes, chunk_key + ) + if if_entities is not None: + maybe_nodes[if_entities["entity_name"]].append(if_entities) + continue + + if_relation = await _handle_single_relationship_extraction( + record_attributes, chunk_key + ) + if if_relation is not None: + maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( + if_relation + ) + already_processed += 1 + already_entities += len(maybe_nodes) + already_relations += len(maybe_edges) + now_ticks = PROMPTS["process_tickers"][ + already_processed % len(PROMPTS["process_tickers"]) + ] + print( + f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", + end="", + flush=True, + ) + return dict(maybe_nodes), dict(maybe_edges) + + # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings + results = await asyncio.gather( + *[_process_single_content(c) for c in ordered_chunks] + ) + print() # clear the progress bar + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + for m_nodes, m_edges in results: + for k, v in m_nodes.items(): + maybe_nodes[k].extend(v) + for k, v in m_edges.items(): + maybe_edges[tuple(sorted(k))].extend(v) + all_entities_data = await asyncio.gather( + *[ + _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ) + all_relationships_data = await asyncio.gather( + *[ + _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config) + for k, v in maybe_edges.items() + ] + ) + if not len(all_entities_data): + logger.warning("Didn't extract any entities, maybe your LLM is not working") + return None + if not len(all_relationships_data): + logger.warning("Didn't extract any relationships, maybe your LLM is not working") + return None + + if entity_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["entity_name"], prefix="ent-"): { + "content": dp["entity_name"] + dp["description"], + "entity_name": dp["entity_name"], + } + for dp in all_entities_data + } + await entity_vdb.upsert(data_for_vdb) + + if relationships_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["src_id"] + 
dp["tgt_id"], prefix="rel-"): { + "src_id": dp["src_id"], + "tgt_id": dp["tgt_id"], + "content": dp["keywords"] + dp["src_id"] + dp["tgt_id"] + dp["description"], + } + for dp in all_relationships_data + } + await relationships_vdb.upsert(data_for_vdb) + + return knwoledge_graph_inst + +async def local_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + keywords = keywords_data.get("low_level_keywords", []) + keywords = ', '.join(keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + context = await _build_local_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + +async def _build_local_query_context( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, +): + results = await entities_vdb.query(query, top_k=query_param.top_k) + if not len(results): + return None + node_datas = await asyncio.gather( + *[knowledge_graph_inst.get_node(r["entity_name"]) for r in results] + ) + if not all([n is not None for n in node_datas]): + logger.warning("Some nodes are missing, maybe the storage is damaged") + node_degrees = await asyncio.gather( + *[knowledge_graph_inst.node_degree(r["entity_name"]) for r in results] + ) + node_datas = [ + {**n, "entity_name": k["entity_name"], "rank": d} + for k, n, d in zip(results, node_datas, node_degrees) + if n is not None + ] + use_text_units = await _find_most_related_text_unit_from_entities( + node_datas, query_param, text_chunks_db, knowledge_graph_inst + ) + use_relations = await _find_most_related_edges_from_entities( + node_datas, query_param, knowledge_graph_inst + ) + logger.info( + f"Local query uses {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} text units" + ) + entites_section_list = [["id", "entity", "type", "description", "rank"]] + for i, n in enumerate(node_datas): + entites_section_list.append( + [ + i, + n["entity_name"], + n.get("entity_type", "UNKNOWN"), + n.get("description", "UNKNOWN"), + n["rank"], + ] + ) + entities_context = list_of_list_to_csv(entites_section_list) + + relations_section_list = [ + ["id", "source", "target", "description", "keywords", "weight", "rank"] + ] + for i, e in enumerate(use_relations): + relations_section_list.append( + [ + i, + e["src_tgt"][0], + e["src_tgt"][1], + e["description"], + e["keywords"], + e["weight"], + e["rank"], + ] + ) + relations_context = list_of_list_to_csv(relations_section_list) + + text_units_section_list = [["id", "content"]] + for i, t in enumerate(use_text_units): + text_units_section_list.append([i, t["content"]]) + 
text_units_context = list_of_list_to_csv(text_units_section_list) + return f""" +-----Entities----- +```csv +{entities_context} +``` +-----Relationships----- +```csv +{relations_context} +``` +-----Sources----- +```csv +{text_units_context} +``` +""" + +async def _find_most_related_text_unit_from_entities( + node_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, +): + text_units = [ + split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) + for dp in node_datas + ] + edges = await asyncio.gather( + *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] + ) + all_one_hop_nodes = set() + for this_edges in edges: + if not this_edges: + continue + all_one_hop_nodes.update([e[1] for e in this_edges]) + all_one_hop_nodes = list(all_one_hop_nodes) + all_one_hop_nodes_data = await asyncio.gather( + *[knowledge_graph_inst.get_node(e) for e in all_one_hop_nodes] + ) + all_one_hop_text_units_lookup = { + k: set(split_string_by_multi_markers(v["source_id"], [GRAPH_FIELD_SEP])) + for k, v in zip(all_one_hop_nodes, all_one_hop_nodes_data) + if v is not None + } + all_text_units_lookup = {} + for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)): + for c_id in this_text_units: + if c_id in all_text_units_lookup: + continue + relation_counts = 0 + for e in this_edges: + if ( + e[1] in all_one_hop_text_units_lookup + and c_id in all_one_hop_text_units_lookup[e[1]] + ): + relation_counts += 1 + all_text_units_lookup[c_id] = { + "data": await text_chunks_db.get_by_id(c_id), + "order": index, + "relation_counts": relation_counts, + } + if any([v is None for v in all_text_units_lookup.values()]): + logger.warning("Text chunks are missing, maybe the storage is damaged") + all_text_units = [ + {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None + ] + all_text_units = sorted( + all_text_units, key=lambda x: (x["order"], -x["relation_counts"]) + ) + all_text_units = truncate_list_by_token_size( + all_text_units, + key=lambda x: x["data"]["content"], + max_token_size=query_param.max_token_for_text_unit, + ) + all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] + return all_text_units + +async def _find_most_related_edges_from_entities( + node_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, +): + all_related_edges = await asyncio.gather( + *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] + ) + all_edges = set() + for this_edges in all_related_edges: + all_edges.update([tuple(sorted(e)) for e in this_edges]) + all_edges = list(all_edges) + all_edges_pack = await asyncio.gather( + *[knowledge_graph_inst.get_edge(e[0], e[1]) for e in all_edges] + ) + all_edges_degree = await asyncio.gather( + *[knowledge_graph_inst.edge_degree(e[0], e[1]) for e in all_edges] + ) + all_edges_data = [ + {"src_tgt": k, "rank": d, **v} + for k, v, d in zip(all_edges, all_edges_pack, all_edges_degree) + if v is not None + ] + all_edges_data = sorted( + all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True + ) + all_edges_data = truncate_list_by_token_size( + all_edges_data, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_global_context, + ) + return all_edges_data + +async def global_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: 
BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + keywords = keywords_data.get("high_level_keywords", []) + keywords = ', '.join(keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + context = await _build_global_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) + + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + +async def _build_global_query_context( + keywords, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, +): + results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + + if not len(results): + return None + + edge_datas = await asyncio.gather( + *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results] + ) + + if not all([n is not None for n in edge_datas]): + logger.warning("Some edges are missing, maybe the storage is damaged") + edge_degree = await asyncio.gather( + *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results] + ) + edge_datas = [ + {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v} + for k, v, d in zip(results, edge_datas, edge_degree) + if v is not None + ] + edge_datas = sorted( + edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True + ) + edge_datas = truncate_list_by_token_size( + edge_datas, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_global_context, + ) + + use_entities = await _find_most_related_entities_from_relationships( + edge_datas, query_param, knowledge_graph_inst + ) + use_text_units = await _find_related_text_unit_from_relationships( + edge_datas, query_param, text_chunks_db, knowledge_graph_inst + ) + logger.info( + f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} text units" + ) + relations_section_list = [ + ["id", "source", "target", "description", "keywords", "weight", "rank"] + ] + for i, e in enumerate(edge_datas): + relations_section_list.append( + [ + i, + e["src_id"], + e["tgt_id"], + e["description"], + e["keywords"], + e["weight"], + e["rank"], + ] + ) + relations_context = list_of_list_to_csv(relations_section_list) + + entites_section_list = [["id", "entity", "type", "description", "rank"]] + for i, n in enumerate(use_entities): + entites_section_list.append( + [ + i, + n["entity_name"], + n.get("entity_type", "UNKNOWN"), + n.get("description", "UNKNOWN"), + n["rank"], + ] + ) + entities_context = list_of_list_to_csv(entites_section_list) + + text_units_section_list = [["id", "content"]] + for i, t in enumerate(use_text_units): + text_units_section_list.append([i, t["content"]]) + text_units_context = list_of_list_to_csv(text_units_section_list) + + return f""" +-----Entities----- +```csv 
+{entities_context} +``` +-----Relationships----- +```csv +{relations_context} +``` +-----Sources----- +```csv +{text_units_context} +``` +""" + +async def _find_most_related_entities_from_relationships( + edge_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, +): + entity_names = set() + for e in edge_datas: + entity_names.add(e["src_id"]) + entity_names.add(e["tgt_id"]) + + node_datas = await asyncio.gather( + *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] + ) + + node_degrees = await asyncio.gather( + *[knowledge_graph_inst.node_degree(entity_name) for entity_name in entity_names] + ) + node_datas = [ + {**n, "entity_name": k, "rank": d} + for k, n, d in zip(entity_names, node_datas, node_degrees) + ] + + node_datas = truncate_list_by_token_size( + node_datas, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_local_context, + ) + + return node_datas + +async def _find_related_text_unit_from_relationships( + edge_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, +): + + text_units = [ + split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) + for dp in edge_datas + ] + + all_text_units_lookup = {} + + for index, unit_list in enumerate(text_units): + for c_id in unit_list: + if c_id not in all_text_units_lookup: + all_text_units_lookup[c_id] = { + "data": await text_chunks_db.get_by_id(c_id), + "order": index, + } + + if any([v is None for v in all_text_units_lookup.values()]): + logger.warning("Text chunks are missing, maybe the storage is damaged") + all_text_units = [ + {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None + ] + all_text_units = sorted( + all_text_units, key=lambda x: x["order"] + ) + all_text_units = truncate_list_by_token_size( + all_text_units, + key=lambda x: x["data"]["content"], + max_token_size=query_param.max_token_for_text_unit, + ) + all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] + + return all_text_units + +async def hybird_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + hl_keywords = keywords_data.get("high_level_keywords", []) + ll_keywords = keywords_data.get("low_level_keywords", []) + hl_keywords = ', '.join(hl_keywords) + ll_keywords = ', '.join(ll_keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + low_level_context = await _build_local_query_context( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + + high_level_context = await _build_global_query_context( + hl_keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) + + context = combine_contexts(high_level_context, low_level_context) + + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + 
context_data=context, response_type=query_param.response_type
+    )
+    response = await use_model_func(
+        query,
+        system_prompt=sys_prompt,
+    )
+    return response
+
+def combine_contexts(high_level_context, low_level_context):
+    # Function to extract entities, relationships, and sources from context strings
+    def extract_sections(context):
+        entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL)
+        relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL)
+        sources_match = re.search(r'-----Sources-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL)
+
+        entities = entities_match.group(1) if entities_match else ''
+        relationships = relationships_match.group(1) if relationships_match else ''
+        sources = sources_match.group(1) if sources_match else ''
+
+        return entities, relationships, sources
+
+    # Extract sections from both contexts
+    hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context)
+    ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context)
+
+    # Combine and deduplicate the entities
+    combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n')))
+    combined_entities = '\n'.join(combined_entities_set)
+
+    # Combine and deduplicate the relationships
+    combined_relationships_set = set(filter(None, hl_relationships.strip().split('\n') + ll_relationships.strip().split('\n')))
+    combined_relationships = '\n'.join(combined_relationships_set)
+
+    # Combine and deduplicate the sources
+    combined_sources_set = set(filter(None, hl_sources.strip().split('\n') + ll_sources.strip().split('\n')))
+    combined_sources = '\n'.join(combined_sources_set)
+
+    # Format the combined context
+    return f"""
+-----Entities-----
+```csv
+{combined_entities}
+-----Relationships-----
+{combined_relationships}
+-----Sources-----
+{combined_sources}
+"""
+
+async def naive_query(
+    query,
+    chunks_vdb: BaseVectorStorage,
+    text_chunks_db: BaseKVStorage[TextChunkSchema],
+    query_param: QueryParam,
+    global_config: dict,
+):
+    use_model_func = global_config["llm_model_func"]
+    results = await chunks_vdb.query(query, top_k=query_param.top_k)
+    if not len(results):
+        return PROMPTS["fail_response"]
+    chunks_ids = [r["id"] for r in results]
+    chunks = await text_chunks_db.get_by_ids(chunks_ids)
+
+    maybe_trun_chunks = truncate_list_by_token_size(
+        chunks,
+        key=lambda x: x["content"],
+        max_token_size=query_param.max_token_for_text_unit,
+    )
+    logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks")
+    section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
+    if query_param.only_need_context:
+        return section
+    sys_prompt_temp = PROMPTS["naive_rag_response"]
+    sys_prompt = sys_prompt_temp.format(
+        content_data=section, response_type=query_param.response_type
+    )
+    response = await use_model_func(
+        query,
+        system_prompt=sys_prompt,
+    )
+    return response
+
diff --git a/lightrag/prompt.py b/lightrag/prompt.py
new file mode 100644
index 000000000..5d28e49c5
--- /dev/null
+++ b/lightrag/prompt.py
@@ -0,0 +1,256 @@
+GRAPH_FIELD_SEP = "<SEP>"
+
+PROMPTS = {}
+
+PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
+PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
+PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
+PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
+
+PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
+
+PROMPTS[
+    "entity_extraction"
+] = """-Goal-
+Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+
+-Steps-
+1. Identify all entities. For each identified entity, extract the following information:
+- entity_name: Name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
+
+3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+5. When finished, output {completion_delimiter}
+
+######################
+-Examples-
+######################
+Example 1:
+
+Entity_types: [person, technology, mission, organization, location]
+Text:
+while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
+
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
+
+The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
+
+It was a small transformation, barely perceptible, but one that Alex noted with an inward nod.
They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} +("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} +("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} +("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} +############################# +Example 2: + +Entity_types: [person, technology, mission, organization, location] +Text: +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. + +Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. + +Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly +############# +Output: +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} +("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} +("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} +("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} +############################# +Example 3: + +Entity_types: [person, role, technology, organization, event, location, concept] +Text: +their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." + +Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." + +Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
+ +The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation +############# +Output: +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} +("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} +("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} +("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} +("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} +("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} +("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +PROMPTS[ + "summarize_entity_descriptions" +] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. 
+ +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +PROMPTS[ + "entiti_continue_extraction" +] = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +PROMPTS[ + "entiti_if_loop_extraction" +] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." + +PROMPTS[ + "rag_response" +] = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + + +---Data tables--- + +{context_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +PROMPTS["keywords_extraction"] = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. + +---Goal--- + +Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. + +---Instructions--- + +- Output the keywords in JSON format. +- The JSON should have two keys: + - "high_level_keywords" for overarching concepts or themes. + - "low_level_keywords" for specific entities or details. + +###################### +-Examples- +###################### +Example 1: + +Query: "How does international trade influence global economic stability?" +################ +Output: +{{ + "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], + "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] +}} +############################# +Example 2: + +Query: "What are the environmental consequences of deforestation on biodiversity?" +################ +Output: +{{ + "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], + "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] +}} +############################# +Example 3: + +Query: "What is the role of education in reducing poverty?" 
+################ +Output: +{{ + "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], + "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] +}} +############################# +-Real Data- +###################### +Query: {query} +###################### +Output: + +""" + +PROMPTS[ + "naive_rag_response" +] = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" diff --git a/lightrag/storage.py b/lightrag/storage.py new file mode 100644 index 000000000..2f2bb7d8f --- /dev/null +++ b/lightrag/storage.py @@ -0,0 +1,246 @@ +import asyncio +import html +import json +import os +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Union, cast +import pickle +import hnswlib +import networkx as nx +import numpy as np +from nano_vectordb import NanoVectorDB +import xxhash + +from .utils import load_json, logger, write_json +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, +) + +@dataclass +class JsonKVStorage(BaseKVStorage): + def __post_init__(self): + working_dir = self.global_config["working_dir"] + self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") + self._data = load_json(self._file_name) or {} + logger.info(f"Load KV {self.namespace} with {len(self._data)} data") + + async def all_keys(self) -> list[str]: + return list(self._data.keys()) + + async def index_done_callback(self): + write_json(self._data, self._file_name) + + async def get_by_id(self, id): + return self._data.get(id, None) + + async def get_by_ids(self, ids, fields=None): + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + async def filter_keys(self, data: list[str]) -> set[str]: + return set([s for s in data if s not in self._data]) + + async def upsert(self, data: dict[str, dict]): + left_data = {k: v for k, v in data.items() if k not in self._data} + self._data.update(left_data) + return left_data + + async def drop(self): + self._data = {} + +@dataclass +class NanoVectorDBStorage(BaseVectorStorage): + cosine_better_than_threshold: float = 0.2 + + def __post_init__(self): + + self._client_file_name = os.path.join( + self.global_config["working_dir"], f"vdb_{self.namespace}.json" + ) + self._max_batch_size = self.global_config["embedding_batch_num"] + self._client = NanoVectorDB( + self.embedding_func.embedding_dim, storage_file=self._client_file_name + ) + self.cosine_better_than_threshold = self.global_config.get( + "cosine_better_than_threshold", self.cosine_better_than_threshold + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not len(data): + logger.warning("You insert an 
empty data to vector DB") + return [] + list_data = [ + { + "__id__": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + for i, d in enumerate(list_data): + d["__vector__"] = embeddings[i] + results = self._client.upsert(datas=list_data) + return results + + async def query(self, query: str, top_k=5): + embedding = await self.embedding_func([query]) + embedding = embedding[0] + results = self._client.query( + query=embedding, + top_k=top_k, + better_than_threshold=self.cosine_better_than_threshold, + ) + results = [ + {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results + ] + return results + + async def index_done_callback(self): + self._client.save() + +@dataclass +class NetworkXStorage(BaseGraphStorage): + @staticmethod + def load_nx_graph(file_name) -> nx.Graph: + if os.path.exists(file_name): + return nx.read_graphml(file_name) + return None + + @staticmethod + def write_nx_graph(graph: nx.Graph, file_name): + logger.info( + f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" + ) + nx.write_graphml(graph, file_name) + + @staticmethod + def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Return the largest connected component of the graph, with nodes and edges sorted in a stable way. + """ + from graspologic.utils import largest_connected_component + + graph = graph.copy() + graph = cast(nx.Graph, largest_connected_component(graph)) + node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + graph = nx.relabel_nodes(graph, node_mapping) + return NetworkXStorage._stabilize_graph(graph) + + @staticmethod + def _stabilize_graph(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Ensure an undirected graph with the same relationships will always be read the same way. 
+ """ + fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() + + sorted_nodes = graph.nodes(data=True) + sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) + + fixed_graph.add_nodes_from(sorted_nodes) + edges = list(graph.edges(data=True)) + + if not graph.is_directed(): + + def _sort_source_target(edge): + source, target, edge_data = edge + if source > target: + temp = source + source = target + target = temp + return source, target, edge_data + + edges = [_sort_source_target(edge) for edge in edges] + + def _get_edge_key(source: Any, target: Any) -> str: + return f"{source} -> {target}" + + edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) + + fixed_graph.add_edges_from(edges) + return fixed_graph + + def __post_init__(self): + self._graphml_xml_file = os.path.join( + self.global_config["working_dir"], f"graph_{self.namespace}.graphml" + ) + preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) + if preloaded_graph is not None: + logger.info( + f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" + ) + self._graph = preloaded_graph or nx.Graph() + self._node_embed_algorithms = { + "node2vec": self._node2vec_embed, + } + + async def index_done_callback(self): + NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) + + async def has_node(self, node_id: str) -> bool: + return self._graph.has_node(node_id) + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + return self._graph.has_edge(source_node_id, target_node_id) + + async def get_node(self, node_id: str) -> Union[dict, None]: + return self._graph.nodes.get(node_id) + + async def node_degree(self, node_id: str) -> int: + return self._graph.degree(node_id) + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + return self._graph.degree(src_id) + self._graph.degree(tgt_id) + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + return self._graph.edges.get((source_node_id, target_node_id)) + + async def get_node_edges(self, source_node_id: str): + if self._graph.has_node(source_node_id): + return list(self._graph.edges(source_node_id)) + return None + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + self._graph.add_node(node_id, **node_data) + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + self._graph.add_edge(source_node_id, target_node_id, **edge_data) + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + if algorithm not in self._node_embed_algorithms: + raise ValueError(f"Node embedding algorithm {algorithm} not supported") + return await self._node_embed_algorithms[algorithm]() + + async def _node2vec_embed(self): + from graspologic import embed + + embeddings, nodes = embed.node2vec_embed( + self._graph, + **self.global_config["node2vec_params"], + ) + + nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] + return embeddings, nodes_ids diff --git a/lightrag/utils.py b/lightrag/utils.py new file mode 100644 index 000000000..c75b4270c --- /dev/null +++ b/lightrag/utils.py @@ -0,0 +1,165 @@ +import asyncio +import html +import json +import logging +import os +import re +from dataclasses import dataclass +from functools import wraps +from hashlib import md5 +from typing import Any, Union + +import numpy as np +import tiktoken + +ENCODER = None + +logger = 
logging.getLogger("lightrag") + +def set_logger(log_file: str): + logger.setLevel(logging.DEBUG) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + + if not logger.handlers: + logger.addHandler(file_handler) + +@dataclass +class EmbeddingFunc: + embedding_dim: int + max_token_size: int + func: callable + + async def __call__(self, *args, **kwargs) -> np.ndarray: + return await self.func(*args, **kwargs) + +def locate_json_string_body_from_string(content: str) -> Union[str, None]: + """Locate the JSON string body from a string""" + maybe_json_str = re.search(r"{.*}", content, re.DOTALL) + if maybe_json_str is not None: + return maybe_json_str.group(0) + else: + return None + +def convert_response_to_json(response: str) -> dict: + json_str = locate_json_string_body_from_string(response) + assert json_str is not None, f"Unable to parse JSON from response: {response}" + try: + data = json.loads(json_str) + return data + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON: {json_str}") + raise e from None + +def compute_args_hash(*args): + return md5(str(args).encode()).hexdigest() + +def compute_mdhash_id(content, prefix: str = ""): + return prefix + md5(content.encode()).hexdigest() + +def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): + """Add restriction of maximum async calling times for a async func""" + + def final_decro(func): + """Not using async.Semaphore to aovid use nest-asyncio""" + __current_size = 0 + + @wraps(func) + async def wait_func(*args, **kwargs): + nonlocal __current_size + while __current_size >= max_size: + await asyncio.sleep(waitting_time) + __current_size += 1 + result = await func(*args, **kwargs) + __current_size -= 1 + return result + + return wait_func + + return final_decro + +def wrap_embedding_func_with_attrs(**kwargs): + """Wrap a function with attributes""" + + def final_decro(func) -> EmbeddingFunc: + new_func = EmbeddingFunc(**kwargs, func=func) + return new_func + + return final_decro + +def load_json(file_name): + if not os.path.exists(file_name): + return None + with open(file_name) as f: + return json.load(f) + +def write_json(json_obj, file_name): + with open(file_name, "w") as f: + json.dump(json_obj, f, indent=2, ensure_ascii=False) + +def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + tokens = ENCODER.encode(content) + return tokens + + +def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + content = ENCODER.decode(tokens) + return content + +def pack_user_ass_to_openai_messages(*args: str): + roles = ["user", "assistant"] + return [ + {"role": roles[i % 2], "content": content} for i, content in enumerate(args) + ] + +def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: + """Split a string by multiple markers""" + if not markers: + return [content] + results = re.split("|".join(re.escape(marker) for marker in markers), content) + return [r.strip() for r in results if r.strip()] + +# Refer the utils functions of the official GraphRAG implementation: +# https://github.com/microsoft/graphrag +def clean_str(input: Any) -> str: + """Clean an input string by removing HTML escapes, control 
characters, and other unwanted characters.""" + # If we get non-string input, just give it back + if not isinstance(input, str): + return input + + result = html.unescape(input.strip()) + # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python + return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + +def is_float_regex(value): + return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + +def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): + """Truncate a list of data by token size""" + if max_token_size <= 0: + return [] + tokens = 0 + for i, data in enumerate(list_data): + tokens += len(encode_string_by_tiktoken(key(data))) + if tokens > max_token_size: + return list_data[:i] + return list_data + +def list_of_list_to_csv(data: list[list]): + return "\n".join( + [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] + ) + +def save_data_to_file(data, file_name): + with open(file_name, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file From 81209fa96be951216323ca65bd02e4f6dd51ccf0 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 15:02:30 +0800 Subject: [PATCH 10/67] update --- README.md | 198 +++++++++ lightrag/__init__.py | 5 + lightrag/base.py | 116 ++++++ lightrag/lightrag.py | 300 ++++++++++++++ lightrag/llm.py | 88 ++++ lightrag/operate.py | 944 +++++++++++++++++++++++++++++++++++++++++++ lightrag/prompt.py | 256 ++++++++++++ lightrag/storage.py | 246 +++++++++++ lightrag/utils.py | 165 ++++++++ 9 files changed, 2318 insertions(+) create mode 100644 README.md create mode 100644 lightrag/__init__.py create mode 100644 lightrag/base.py create mode 100644 lightrag/lightrag.py create mode 100644 lightrag/llm.py create mode 100644 lightrag/operate.py create mode 100644 lightrag/prompt.py create mode 100644 lightrag/storage.py create mode 100644 lightrag/utils.py diff --git a/README.md b/README.md new file mode 100644 index 000000000..42de1c1cb --- /dev/null +++ b/README.md @@ -0,0 +1,198 @@ +# LightRAG: Simple and Fast Retrieval-Augmented Generation +![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) + + + + + + +This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). +![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) +## Install + +* Install from source + +```bash +cd LightRAG +pip install -e . 
+``` +* Install from PyPI +```bash +pip install lightrag-hku +``` + +## Quick Start + +* Set OpenAI API key in environment: `export OPENAI_API_KEY="sk-...".` +* Download the demo text "A Christmas Carol by Charles Dickens" +```bash +curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt +``` +Use the below python snippet: + +```python +from lightrag import LightRAG, QueryParam + +rag = LightRAG(working_dir="./dickens") + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybird search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybird"))) +``` +Batch Insert +```python +rag.insert(["TEXT1", "TEXT2",...]) +``` +Incremental Insert + +```python +rag = LightRAG(working_dir="./dickens") + +with open("./newText.txt") as f: + rag.insert(f.read()) +``` +## Evaluation +### Dataset +The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). + +### Generate Query +LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`. +```python +Given the following description of a dataset: + +{description} + +Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset. + +Output the results in the following structure: +- User 1: [user description] + - Task 1: [task description] + - Question 1: + - Question 2: + - Question 3: + - Question 4: + - Question 5: + - Task 2: [task description] + ... + - Task 5: [task description] +- User 2: [user description] + ... +- User 5: [user description] + ... +``` + + ### Batch Eval +To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`. +```python +---Role--- +You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. +---Goal--- +You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. + +- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? +- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? +- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic? + +For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories. + +Here is the question: +{query} + +Here are the two answers: + +**Answer 1:** +{answer1} + +**Answer 2:** +{answer2} + +Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion. 
+ +Output your evaluation in the following JSON format: + +{{ + "Comprehensiveness": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation here]" + }}, + "Empowerment": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation here]" + }}, + "Overall Winner": {{ + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]" + }} +}} +``` +### Overall Performance Table +### Overall Performance Table +| | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | +|----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| +| | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | +| **Comprehensiveness** | 32.69% | **67.31%** | 35.44% | **64.56%** | 19.05% | **80.95%** | 36.36% | **63.64%** | +| **Diversity** | 24.09% | **75.91%** | 35.24% | **64.76%** | 10.98% | **89.02%** | 30.76% | **69.24%** | +| **Empowerment** | 31.35% | **68.65%** | 35.48% | **64.52%** | 17.59% | **82.41%** | 40.95% | **59.05%** | +| **Overall** | 33.30% | **66.70%** | 34.76% | **65.24%** | 17.46% | **82.54%** | 37.59% | **62.40%** | +| | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | +| **Comprehensiveness** | 32.05% | **67.95%** | 39.30% | **60.70%** | 18.57% | **81.43%** | 38.89% | **61.11%** | +| **Diversity** | 29.44% | **70.56%** | 38.71% | **61.29%** | 15.14% | **84.86%** | 28.50% | **71.50%** | +| **Empowerment** | 32.51% | **67.49%** | 37.52% | **62.48%** | 17.80% | **82.20%** | 43.96% | **56.04%** | +| **Overall** | 33.29% | **66.71%** | 39.03% | **60.97%** | 17.80% | **82.20%** | 39.61% | **60.39%** | +| | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | +| **Comprehensiveness** | 24.39% | **75.61%** | 36.49% | **63.51%** | 27.68% | **72.32%** | 42.17% | **57.83%** | +| **Diversity** | 24.96% | **75.34%** | 37.41% | **62.59%** | 18.79% | **81.21%** | 30.88% | **69.12%** | +| **Empowerment** | 24.89% | **75.11%** | 34.99% | **65.01%** | 26.99% | **73.01%** | **45.61%** | **54.39%** | +| **Overall** | 23.17% | **76.83%** | 35.67% | **64.33%** | 27.68% | **72.32%** | 42.72% | **57.28%** | +| | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | +| **Comprehensiveness** | 45.56% | **54.44%** | 45.98% | **54.02%** | 47.13% | **52.87%** | **51.86%** | 48.14% | +| **Diversity** | 19.65% | **80.35%** | 39.64% | **60.36%** | 25.55% | **74.45%** | 35.87% | **64.13%** | +| **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | +| **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | + +## Code Structure + +```python +. 
+├── examples +│ ├── batch_eval.py +│ ├── generate_query.py +│ ├── insert.py +│ └── query.py +├── lightrag +│ ├── __init__.py +│ ├── base.py +│ ├── lightrag.py +│ ├── llm.py +│ ├── operate.py +│ ├── prompt.py +│ ├── storage.py +│ └── utils.py +├── LICENSE +├── README.md +├── requirements.txt +└── setup.py +``` +## Citation + +``` +@article{guo2024lightrag, +title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, +author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, +year={2024}, +eprint={2410.05779}, +archivePrefix={arXiv}, +primaryClass={cs.IR} +} +``` diff --git a/lightrag/__init__.py b/lightrag/__init__.py new file mode 100644 index 000000000..dc497cd44 --- /dev/null +++ b/lightrag/__init__.py @@ -0,0 +1,5 @@ +from .lightrag import LightRAG, QueryParam + +__version__ = "0.0.2" +__author__ = "Zirui Guo" +__url__ = "https://github.com/HKUDS/GraphEdit" diff --git a/lightrag/base.py b/lightrag/base.py new file mode 100644 index 000000000..9c0422feb --- /dev/null +++ b/lightrag/base.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass, field +from typing import TypedDict, Union, Literal, Generic, TypeVar + +import numpy as np + +from .utils import EmbeddingFunc + +TextChunkSchema = TypedDict( + "TextChunkSchema", + {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}, +) + +T = TypeVar("T") + +@dataclass +class QueryParam: + mode: Literal["local", "global", "hybird", "naive"] = "global" + only_need_context: bool = False + response_type: str = "Multiple Paragraphs" + top_k: int = 60 + max_token_for_text_unit: int = 4000 + max_token_for_global_context: int = 4000 + max_token_for_local_context: int = 4000 + + +@dataclass +class StorageNameSpace: + namespace: str + global_config: dict + + async def index_done_callback(self): + """commit the storage operations after indexing""" + pass + + async def query_done_callback(self): + """commit the storage operations after querying""" + pass + +@dataclass +class BaseVectorStorage(StorageNameSpace): + embedding_func: EmbeddingFunc + meta_fields: set = field(default_factory=set) + + async def query(self, query: str, top_k: int) -> list[dict]: + raise NotImplementedError + + async def upsert(self, data: dict[str, dict]): + """Use 'content' field from value for embedding, use key as id. 
+ If embedding_func is None, use 'embedding' field from value + """ + raise NotImplementedError + +@dataclass +class BaseKVStorage(Generic[T], StorageNameSpace): + async def all_keys(self) -> list[str]: + raise NotImplementedError + + async def get_by_id(self, id: str) -> Union[T, None]: + raise NotImplementedError + + async def get_by_ids( + self, ids: list[str], fields: Union[set[str], None] = None + ) -> list[Union[T, None]]: + raise NotImplementedError + + async def filter_keys(self, data: list[str]) -> set[str]: + """return un-exist keys""" + raise NotImplementedError + + async def upsert(self, data: dict[str, T]): + raise NotImplementedError + + async def drop(self): + raise NotImplementedError + + +@dataclass +class BaseGraphStorage(StorageNameSpace): + async def has_node(self, node_id: str) -> bool: + raise NotImplementedError + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + raise NotImplementedError + + async def node_degree(self, node_id: str) -> int: + raise NotImplementedError + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + raise NotImplementedError + + async def get_node(self, node_id: str) -> Union[dict, None]: + raise NotImplementedError + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + raise NotImplementedError + + async def get_node_edges( + self, source_node_id: str + ) -> Union[list[tuple[str, str]], None]: + raise NotImplementedError + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + raise NotImplementedError + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + raise NotImplementedError + + async def clustering(self, algorithm: str): + raise NotImplementedError + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py new file mode 100644 index 000000000..836fda9ec --- /dev/null +++ b/lightrag/lightrag.py @@ -0,0 +1,300 @@ +import asyncio +import os +from dataclasses import asdict, dataclass, field +from datetime import datetime +from functools import partial +from typing import Type, cast + +from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding +from .operate import ( + chunking_by_token_size, + extract_entities, + local_query, + global_query, + hybird_query, + naive_query, +) + +from .storage import ( + JsonKVStorage, + NanoVectorDBStorage, + NetworkXStorage, +) +from .utils import ( + EmbeddingFunc, + compute_mdhash_id, + limit_async_func_call, + convert_response_to_json, + logger, + set_logger, +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + StorageNameSpace, + QueryParam, +) + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + # If there is already an event loop, use it. + loop = asyncio.get_event_loop() + except RuntimeError: + # If in a sub-thread, create a new event loop. 
+ logger.info("Creating a new event loop in a sub-thread.") + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + +@dataclass +class LightRAG: + working_dir: str = field( + default_factory=lambda: f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" + ) + + # text chunking + chunk_token_size: int = 1200 + chunk_overlap_token_size: int = 100 + tiktoken_model_name: str = "gpt-4o-mini" + + # entity extraction + entity_extract_max_gleaning: int = 1 + entity_summary_to_max_tokens: int = 500 + + # node embedding + node_embedding_algorithm: str = "node2vec" + node2vec_params: dict = field( + default_factory=lambda: { + "dimensions": 1536, + "num_walks": 10, + "walk_length": 40, + "num_walks": 10, + "window_size": 2, + "iterations": 3, + "random_seed": 3, + } + ) + + # text embedding + embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) + embedding_batch_num: int = 32 + embedding_func_max_async: int = 16 + + # LLM + llm_model_func: callable = gpt_4o_mini_complete + llm_model_max_token_size: int = 32768 + llm_model_max_async: int = 16 + + # storage + key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage + vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage + vector_db_storage_cls_kwargs: dict = field(default_factory=dict) + graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage + enable_llm_cache: bool = True + + # extension + addon_params: dict = field(default_factory=dict) + convert_response_to_json_func: callable = convert_response_to_json + + def __post_init__(self): + log_file = os.path.join(self.working_dir, "lightrag.log") + set_logger(log_file) + logger.info(f"Logger initialized for working directory: {self.working_dir}") + + _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) + logger.debug(f"LightRAG init with param:\n {_print_config}\n") + + if not os.path.exists(self.working_dir): + logger.info(f"Creating working directory {self.working_dir}") + os.makedirs(self.working_dir) + + self.full_docs = self.key_string_value_json_storage_cls( + namespace="full_docs", global_config=asdict(self) + ) + + self.text_chunks = self.key_string_value_json_storage_cls( + namespace="text_chunks", global_config=asdict(self) + ) + + self.llm_response_cache = ( + self.key_string_value_json_storage_cls( + namespace="llm_response_cache", global_config=asdict(self) + ) + if self.enable_llm_cache + else None + ) + self.chunk_entity_relation_graph = self.graph_storage_cls( + namespace="chunk_entity_relation", global_config=asdict(self) + ) + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( + self.embedding_func + ) + self.entities_vdb = ( + self.vector_db_storage_cls( + namespace="entities", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"entity_name"} + ) + ) + self.relationships_vdb = ( + self.vector_db_storage_cls( + namespace="relationships", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"src_id", "tgt_id"} + ) + ) + self.chunks_vdb = ( + self.vector_db_storage_cls( + namespace="chunks", + global_config=asdict(self), + embedding_func=self.embedding_func, + ) + ) + + self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( + partial(self.llm_model_func, hashing_kv=self.llm_response_cache) + ) + + def insert(self, string_or_strings): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert(string_or_strings)) + + async def ainsert(self, 
string_or_strings): + try: + if isinstance(string_or_strings, str): + string_or_strings = [string_or_strings] + + new_docs = { + compute_mdhash_id(c.strip(), prefix="doc-"): {"content": c.strip()} + for c in string_or_strings + } + _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not len(new_docs): + logger.warning(f"All docs are already in the storage") + return + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + inserting_chunks = {} + for doc_key, doc in new_docs.items(): + chunks = { + compute_mdhash_id(dp["content"], prefix="chunk-"): { + **dp, + "full_doc_id": doc_key, + } + for dp in chunking_by_token_size( + doc["content"], + overlap_token_size=self.chunk_overlap_token_size, + max_token_size=self.chunk_token_size, + tiktoken_model=self.tiktoken_model_name, + ) + } + inserting_chunks.update(chunks) + _add_chunk_keys = await self.text_chunks.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + if not len(inserting_chunks): + logger.warning(f"All chunks are already in the storage") + return + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + await self.chunks_vdb.upsert(inserting_chunks) + + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knwoledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + self.chunk_entity_relation_graph = maybe_new_kg + + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + finally: + await self._insert_done() + + async def _insert_done(self): + tasks = [] + for storage_inst in [ + self.full_docs, + self.text_chunks, + self.llm_response_cache, + self.entities_vdb, + self.relationships_vdb, + self.chunks_vdb, + self.chunk_entity_relation_graph, + ]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) + await asyncio.gather(*tasks) + + def query(self, query: str, param: QueryParam = QueryParam()): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.aquery(query, param)) + + async def aquery(self, query: str, param: QueryParam = QueryParam()): + if param.mode == "local": + response = await local_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "global": + response = await global_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "hybird": + response = await hybird_query( + query, + self.chunk_entity_relation_graph, + self.entities_vdb, + self.relationships_vdb, + self.text_chunks, + param, + asdict(self), + ) + elif param.mode == "naive": + response = await naive_query( + query, + self.chunks_vdb, + self.text_chunks, + param, + asdict(self), + ) + else: + raise ValueError(f"Unknown mode {param.mode}") + await self._query_done() + return response + + + async def _query_done(self): + tasks = [] + for storage_inst in [self.llm_response_cache]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, 
storage_inst).index_done_callback()) + await asyncio.gather(*tasks) + + diff --git a/lightrag/llm.py b/lightrag/llm.py new file mode 100644 index 000000000..ee700a104 --- /dev/null +++ b/lightrag/llm.py @@ -0,0 +1,88 @@ +import os +import numpy as np +from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) + +from .base import BaseKVStorage +from .utils import compute_args_hash, wrap_embedding_func_with_attrs + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), +) +async def openai_complete_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_async_client = AsyncOpenAI() + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await openai_async_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response.choices[0].message.content, "model": model}} + ) + return response.choices[0].message.content + +async def gpt_4o_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def gpt_4o_mini_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "gpt-4o-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), +) +async def openai_embedding(texts: list[str]) -> np.ndarray: + openai_async_client = AsyncOpenAI() + response = await openai_async_client.embeddings.create( + model="text-embedding-3-small", input=texts, encoding_format="float" + ) + return np.array([dp.embedding for dp in response.data]) + +if __name__ == "__main__": + import asyncio + + async def main(): + result = await gpt_4o_mini_complete('How are you?') + print(result) + + asyncio.run(main()) diff --git a/lightrag/operate.py b/lightrag/operate.py new file mode 100644 index 000000000..2d3271da8 --- /dev/null +++ b/lightrag/operate.py @@ -0,0 +1,944 @@ +import asyncio +import json +import re +from typing import Union +from collections import Counter, defaultdict + +from .utils import ( + logger, + clean_str, + compute_mdhash_id, + decode_tokens_by_tiktoken, + encode_string_by_tiktoken, + is_float_regex, + list_of_list_to_csv, + pack_user_ass_to_openai_messages, + split_string_by_multi_markers, + truncate_list_by_token_size, +) +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, + TextChunkSchema, + QueryParam, +) +from .prompt import GRAPH_FIELD_SEP, PROMPTS + +def 
chunking_by_token_size( + content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" +): + tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) + results = [] + for index, start in enumerate( + range(0, len(tokens), max_token_size - overlap_token_size) + ): + chunk_content = decode_tokens_by_tiktoken( + tokens[start : start + max_token_size], model_name=tiktoken_model + ) + results.append( + { + "tokens": min(max_token_size, len(tokens) - start), + "content": chunk_content.strip(), + "chunk_order_index": index, + } + ) + return results + +async def _handle_entity_relation_summary( + entity_or_relation_name: str, + description: str, + global_config: dict, +) -> str: + use_llm_func: callable = global_config["llm_model_func"] + llm_max_tokens = global_config["llm_model_max_token_size"] + tiktoken_model_name = global_config["tiktoken_model_name"] + summary_max_tokens = global_config["entity_summary_to_max_tokens"] + + tokens = encode_string_by_tiktoken(description, model_name=tiktoken_model_name) + if len(tokens) < summary_max_tokens: # No need for summary + return description + prompt_template = PROMPTS["summarize_entity_descriptions"] + use_description = decode_tokens_by_tiktoken( + tokens[:llm_max_tokens], model_name=tiktoken_model_name + ) + context_base = dict( + entity_name=entity_or_relation_name, + description_list=use_description.split(GRAPH_FIELD_SEP), + ) + use_prompt = prompt_template.format(**context_base) + logger.debug(f"Trigger summary: {entity_or_relation_name}") + summary = await use_llm_func(use_prompt, max_tokens=summary_max_tokens) + return summary + + +async def _handle_single_entity_extraction( + record_attributes: list[str], + chunk_key: str, +): + if record_attributes[0] != '"entity"' or len(record_attributes) < 4: + return None + # add this record as a node in the G + entity_name = clean_str(record_attributes[1].upper()) + if not entity_name.strip(): + return None + entity_type = clean_str(record_attributes[2].upper()) + entity_description = clean_str(record_attributes[3]) + entity_source_id = chunk_key + return dict( + entity_name=entity_name, + entity_type=entity_type, + description=entity_description, + source_id=entity_source_id, + ) + + +async def _handle_single_relationship_extraction( + record_attributes: list[str], + chunk_key: str, +): + if record_attributes[0] != '"relationship"' or len(record_attributes) < 5: + return None + # add this record as edge + source = clean_str(record_attributes[1].upper()) + target = clean_str(record_attributes[2].upper()) + edge_description = clean_str(record_attributes[3]) + + edge_keywords = clean_str(record_attributes[4]) + edge_source_id = chunk_key + weight = ( + float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0 + ) + return dict( + src_id=source, + tgt_id=target, + weight=weight, + description=edge_description, + keywords=edge_keywords, + source_id=edge_source_id, + ) + + +async def _merge_nodes_then_upsert( + entity_name: str, + nodes_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, +): + already_entitiy_types = [] + already_source_ids = [] + already_description = [] + + already_node = await knwoledge_graph_inst.get_node(entity_name) + if already_node is not None: + already_entitiy_types.append(already_node["entity_type"]) + already_source_ids.extend( + split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_node["description"]) + + entity_type = sorted( + 
Counter( + [dp["entity_type"] for dp in nodes_data] + already_entitiy_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in nodes_data] + already_description)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in nodes_data] + already_source_ids) + ) + description = await _handle_entity_relation_summary( + entity_name, description, global_config + ) + node_data = dict( + entity_type=entity_type, + description=description, + source_id=source_id, + ) + await knwoledge_graph_inst.upsert_node( + entity_name, + node_data=node_data, + ) + node_data["entity_name"] = entity_name + return node_data + + +async def _merge_edges_then_upsert( + src_id: str, + tgt_id: str, + edges_data: list[dict], + knwoledge_graph_inst: BaseGraphStorage, + global_config: dict, +): + already_weights = [] + already_source_ids = [] + already_description = [] + already_keywords = [] + + if await knwoledge_graph_inst.has_edge(src_id, tgt_id): + already_edge = await knwoledge_graph_inst.get_edge(src_id, tgt_id) + already_weights.append(already_edge["weight"]) + already_source_ids.extend( + split_string_by_multi_markers(already_edge["source_id"], [GRAPH_FIELD_SEP]) + ) + already_description.append(already_edge["description"]) + already_keywords.extend( + split_string_by_multi_markers(already_edge["keywords"], [GRAPH_FIELD_SEP]) + ) + + weight = sum([dp["weight"] for dp in edges_data] + already_weights) + description = GRAPH_FIELD_SEP.join( + sorted(set([dp["description"] for dp in edges_data] + already_description)) + ) + keywords = GRAPH_FIELD_SEP.join( + sorted(set([dp["keywords"] for dp in edges_data] + already_keywords)) + ) + source_id = GRAPH_FIELD_SEP.join( + set([dp["source_id"] for dp in edges_data] + already_source_ids) + ) + for need_insert_id in [src_id, tgt_id]: + if not (await knwoledge_graph_inst.has_node(need_insert_id)): + await knwoledge_graph_inst.upsert_node( + need_insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": '"UNKNOWN"', + }, + ) + description = await _handle_entity_relation_summary( + (src_id, tgt_id), description, global_config + ) + await knwoledge_graph_inst.upsert_edge( + src_id, + tgt_id, + edge_data=dict( + weight=weight, + description=description, + keywords=keywords, + source_id=source_id, + ), + ) + + edge_data = dict( + src_id=src_id, + tgt_id=tgt_id, + description=description, + keywords=keywords, + ) + + return edge_data + +async def extract_entities( + chunks: dict[str, TextChunkSchema], + knwoledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + global_config: dict, +) -> Union[BaseGraphStorage, None]: + use_llm_func: callable = global_config["llm_model_func"] + entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] + + ordered_chunks = list(chunks.items()) + + entity_extract_prompt = PROMPTS["entity_extraction"] + context_base = dict( + tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], + record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], + completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], + entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]), + ) + continue_prompt = PROMPTS["entiti_continue_extraction"] + if_loop_prompt = PROMPTS["entiti_if_loop_extraction"] + + already_processed = 0 + already_entities = 0 + already_relations = 0 + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + nonlocal 
already_processed, already_entities, already_relations + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) + final_result = await use_llm_func(hint_prompt) + + history = pack_user_ass_to_openai_messages(hint_prompt, final_result) + for now_glean_index in range(entity_extract_max_gleaning): + glean_result = await use_llm_func(continue_prompt, history_messages=history) + + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) + final_result += glean_result + if now_glean_index == entity_extract_max_gleaning - 1: + break + + if_loop_result: str = await use_llm_func( + if_loop_prompt, history_messages=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + records = split_string_by_multi_markers( + final_result, + [context_base["record_delimiter"], context_base["completion_delimiter"]], + ) + + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + for record in records: + record = re.search(r"\((.*)\)", record) + if record is None: + continue + record = record.group(1) + record_attributes = split_string_by_multi_markers( + record, [context_base["tuple_delimiter"]] + ) + if_entities = await _handle_single_entity_extraction( + record_attributes, chunk_key + ) + if if_entities is not None: + maybe_nodes[if_entities["entity_name"]].append(if_entities) + continue + + if_relation = await _handle_single_relationship_extraction( + record_attributes, chunk_key + ) + if if_relation is not None: + maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( + if_relation + ) + already_processed += 1 + already_entities += len(maybe_nodes) + already_relations += len(maybe_edges) + now_ticks = PROMPTS["process_tickers"][ + already_processed % len(PROMPTS["process_tickers"]) + ] + print( + f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", + end="", + flush=True, + ) + return dict(maybe_nodes), dict(maybe_edges) + + # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings + results = await asyncio.gather( + *[_process_single_content(c) for c in ordered_chunks] + ) + print() # clear the progress bar + maybe_nodes = defaultdict(list) + maybe_edges = defaultdict(list) + for m_nodes, m_edges in results: + for k, v in m_nodes.items(): + maybe_nodes[k].extend(v) + for k, v in m_edges.items(): + maybe_edges[tuple(sorted(k))].extend(v) + all_entities_data = await asyncio.gather( + *[ + _merge_nodes_then_upsert(k, v, knwoledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ) + all_relationships_data = await asyncio.gather( + *[ + _merge_edges_then_upsert(k[0], k[1], v, knwoledge_graph_inst, global_config) + for k, v in maybe_edges.items() + ] + ) + if not len(all_entities_data): + logger.warning("Didn't extract any entities, maybe your LLM is not working") + return None + if not len(all_relationships_data): + logger.warning("Didn't extract any relationships, maybe your LLM is not working") + return None + + if entity_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["entity_name"], prefix="ent-"): { + "content": dp["entity_name"] + dp["description"], + "entity_name": dp["entity_name"], + } + for dp in all_entities_data + } + await entity_vdb.upsert(data_for_vdb) + + if relationships_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["src_id"] + 
dp["tgt_id"], prefix="rel-"): { + "src_id": dp["src_id"], + "tgt_id": dp["tgt_id"], + "content": dp["keywords"] + dp["src_id"] + dp["tgt_id"] + dp["description"], + } + for dp in all_relationships_data + } + await relationships_vdb.upsert(data_for_vdb) + + return knwoledge_graph_inst + +async def local_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + keywords = keywords_data.get("low_level_keywords", []) + keywords = ', '.join(keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + context = await _build_local_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + +async def _build_local_query_context( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, +): + results = await entities_vdb.query(query, top_k=query_param.top_k) + if not len(results): + return None + node_datas = await asyncio.gather( + *[knowledge_graph_inst.get_node(r["entity_name"]) for r in results] + ) + if not all([n is not None for n in node_datas]): + logger.warning("Some nodes are missing, maybe the storage is damaged") + node_degrees = await asyncio.gather( + *[knowledge_graph_inst.node_degree(r["entity_name"]) for r in results] + ) + node_datas = [ + {**n, "entity_name": k["entity_name"], "rank": d} + for k, n, d in zip(results, node_datas, node_degrees) + if n is not None + ] + use_text_units = await _find_most_related_text_unit_from_entities( + node_datas, query_param, text_chunks_db, knowledge_graph_inst + ) + use_relations = await _find_most_related_edges_from_entities( + node_datas, query_param, knowledge_graph_inst + ) + logger.info( + f"Local query uses {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} text units" + ) + entites_section_list = [["id", "entity", "type", "description", "rank"]] + for i, n in enumerate(node_datas): + entites_section_list.append( + [ + i, + n["entity_name"], + n.get("entity_type", "UNKNOWN"), + n.get("description", "UNKNOWN"), + n["rank"], + ] + ) + entities_context = list_of_list_to_csv(entites_section_list) + + relations_section_list = [ + ["id", "source", "target", "description", "keywords", "weight", "rank"] + ] + for i, e in enumerate(use_relations): + relations_section_list.append( + [ + i, + e["src_tgt"][0], + e["src_tgt"][1], + e["description"], + e["keywords"], + e["weight"], + e["rank"], + ] + ) + relations_context = list_of_list_to_csv(relations_section_list) + + text_units_section_list = [["id", "content"]] + for i, t in enumerate(use_text_units): + text_units_section_list.append([i, t["content"]]) + 
text_units_context = list_of_list_to_csv(text_units_section_list) + return f""" +-----Entities----- +```csv +{entities_context} +``` +-----Relationships----- +```csv +{relations_context} +``` +-----Sources----- +```csv +{text_units_context} +``` +""" + +async def _find_most_related_text_unit_from_entities( + node_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, +): + text_units = [ + split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) + for dp in node_datas + ] + edges = await asyncio.gather( + *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] + ) + all_one_hop_nodes = set() + for this_edges in edges: + if not this_edges: + continue + all_one_hop_nodes.update([e[1] for e in this_edges]) + all_one_hop_nodes = list(all_one_hop_nodes) + all_one_hop_nodes_data = await asyncio.gather( + *[knowledge_graph_inst.get_node(e) for e in all_one_hop_nodes] + ) + all_one_hop_text_units_lookup = { + k: set(split_string_by_multi_markers(v["source_id"], [GRAPH_FIELD_SEP])) + for k, v in zip(all_one_hop_nodes, all_one_hop_nodes_data) + if v is not None + } + all_text_units_lookup = {} + for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)): + for c_id in this_text_units: + if c_id in all_text_units_lookup: + continue + relation_counts = 0 + for e in this_edges: + if ( + e[1] in all_one_hop_text_units_lookup + and c_id in all_one_hop_text_units_lookup[e[1]] + ): + relation_counts += 1 + all_text_units_lookup[c_id] = { + "data": await text_chunks_db.get_by_id(c_id), + "order": index, + "relation_counts": relation_counts, + } + if any([v is None for v in all_text_units_lookup.values()]): + logger.warning("Text chunks are missing, maybe the storage is damaged") + all_text_units = [ + {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None + ] + all_text_units = sorted( + all_text_units, key=lambda x: (x["order"], -x["relation_counts"]) + ) + all_text_units = truncate_list_by_token_size( + all_text_units, + key=lambda x: x["data"]["content"], + max_token_size=query_param.max_token_for_text_unit, + ) + all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] + return all_text_units + +async def _find_most_related_edges_from_entities( + node_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, +): + all_related_edges = await asyncio.gather( + *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] + ) + all_edges = set() + for this_edges in all_related_edges: + all_edges.update([tuple(sorted(e)) for e in this_edges]) + all_edges = list(all_edges) + all_edges_pack = await asyncio.gather( + *[knowledge_graph_inst.get_edge(e[0], e[1]) for e in all_edges] + ) + all_edges_degree = await asyncio.gather( + *[knowledge_graph_inst.edge_degree(e[0], e[1]) for e in all_edges] + ) + all_edges_data = [ + {"src_tgt": k, "rank": d, **v} + for k, v, d in zip(all_edges, all_edges_pack, all_edges_degree) + if v is not None + ] + all_edges_data = sorted( + all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True + ) + all_edges_data = truncate_list_by_token_size( + all_edges_data, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_global_context, + ) + return all_edges_data + +async def global_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: 
BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + keywords = keywords_data.get("high_level_keywords", []) + keywords = ', '.join(keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + context = await _build_global_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) + + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + +async def _build_global_query_context( + keywords, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, +): + results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + + if not len(results): + return None + + edge_datas = await asyncio.gather( + *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results] + ) + + if not all([n is not None for n in edge_datas]): + logger.warning("Some edges are missing, maybe the storage is damaged") + edge_degree = await asyncio.gather( + *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results] + ) + edge_datas = [ + {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v} + for k, v, d in zip(results, edge_datas, edge_degree) + if v is not None + ] + edge_datas = sorted( + edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True + ) + edge_datas = truncate_list_by_token_size( + edge_datas, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_global_context, + ) + + use_entities = await _find_most_related_entities_from_relationships( + edge_datas, query_param, knowledge_graph_inst + ) + use_text_units = await _find_related_text_unit_from_relationships( + edge_datas, query_param, text_chunks_db, knowledge_graph_inst + ) + logger.info( + f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} text units" + ) + relations_section_list = [ + ["id", "source", "target", "description", "keywords", "weight", "rank"] + ] + for i, e in enumerate(edge_datas): + relations_section_list.append( + [ + i, + e["src_id"], + e["tgt_id"], + e["description"], + e["keywords"], + e["weight"], + e["rank"], + ] + ) + relations_context = list_of_list_to_csv(relations_section_list) + + entites_section_list = [["id", "entity", "type", "description", "rank"]] + for i, n in enumerate(use_entities): + entites_section_list.append( + [ + i, + n["entity_name"], + n.get("entity_type", "UNKNOWN"), + n.get("description", "UNKNOWN"), + n["rank"], + ] + ) + entities_context = list_of_list_to_csv(entites_section_list) + + text_units_section_list = [["id", "content"]] + for i, t in enumerate(use_text_units): + text_units_section_list.append([i, t["content"]]) + text_units_context = list_of_list_to_csv(text_units_section_list) + + return f""" +-----Entities----- +```csv 
+{entities_context} +``` +-----Relationships----- +```csv +{relations_context} +``` +-----Sources----- +```csv +{text_units_context} +``` +""" + +async def _find_most_related_entities_from_relationships( + edge_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, +): + entity_names = set() + for e in edge_datas: + entity_names.add(e["src_id"]) + entity_names.add(e["tgt_id"]) + + node_datas = await asyncio.gather( + *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] + ) + + node_degrees = await asyncio.gather( + *[knowledge_graph_inst.node_degree(entity_name) for entity_name in entity_names] + ) + node_datas = [ + {**n, "entity_name": k, "rank": d} + for k, n, d in zip(entity_names, node_datas, node_degrees) + ] + + node_datas = truncate_list_by_token_size( + node_datas, + key=lambda x: x["description"], + max_token_size=query_param.max_token_for_local_context, + ) + + return node_datas + +async def _find_related_text_unit_from_relationships( + edge_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, +): + + text_units = [ + split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) + for dp in edge_datas + ] + + all_text_units_lookup = {} + + for index, unit_list in enumerate(text_units): + for c_id in unit_list: + if c_id not in all_text_units_lookup: + all_text_units_lookup[c_id] = { + "data": await text_chunks_db.get_by_id(c_id), + "order": index, + } + + if any([v is None for v in all_text_units_lookup.values()]): + logger.warning("Text chunks are missing, maybe the storage is damaged") + all_text_units = [ + {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None + ] + all_text_units = sorted( + all_text_units, key=lambda x: x["order"] + ) + all_text_units = truncate_list_by_token_size( + all_text_units, + key=lambda x: x["data"]["content"], + max_token_size=query_param.max_token_for_text_unit, + ) + all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] + + return all_text_units + +async def hybird_query( + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +) -> str: + use_model_func = global_config["llm_model_func"] + + kw_prompt_temp = PROMPTS["keywords_extraction"] + kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) + + try: + keywords_data = json.loads(result) + hl_keywords = keywords_data.get("high_level_keywords", []) + ll_keywords = keywords_data.get("low_level_keywords", []) + hl_keywords = ', '.join(hl_keywords) + ll_keywords = ', '.join(ll_keywords) + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + low_level_context = await _build_local_query_context( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + + high_level_context = await _build_global_query_context( + hl_keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) + + context = combine_contexts(high_level_context, low_level_context) + + if query_param.only_need_context: + return context + if context is None: + return PROMPTS["fail_response"] + + sys_prompt_temp = PROMPTS["rag_response"] + sys_prompt = sys_prompt_temp.format( + 
context_data=context, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + +def combine_contexts(high_level_context, low_level_context): + # Function to extract entities, relationships, and sources from context strings + def extract_sections(context): + entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) + relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) + sources_match = re.search(r'-----Sources-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) + + entities = entities_match.group(1) if entities_match else '' + relationships = relationships_match.group(1) if relationships_match else '' + sources = sources_match.group(1) if sources_match else '' + + return entities, relationships, sources + + # Extract sections from both contexts + hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) + ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) + + # Combine and deduplicate the entities + combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n'))) + combined_entities = '\n'.join(combined_entities_set) + + # Combine and deduplicate the relationships + combined_relationships_set = set(filter(None, hl_relationships.strip().split('\n') + ll_relationships.strip().split('\n'))) + combined_relationships = '\n'.join(combined_relationships_set) + + # Combine and deduplicate the sources + combined_sources_set = set(filter(None, hl_sources.strip().split('\n') + ll_sources.strip().split('\n'))) + combined_sources = '\n'.join(combined_sources_set) + + # Format the combined context + return f""" +-----Entities----- +```csv +{combined_entities} +-----Relationships----- +{combined_relationships} +-----Sources----- +{combined_sources} +""" + +async def naive_query( + query, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, +): + use_model_func = global_config["llm_model_func"] + results = await chunks_vdb.query(query, top_k=query_param.top_k) + if not len(results): + return PROMPTS["fail_response"] + chunks_ids = [r["id"] for r in results] + chunks = await text_chunks_db.get_by_ids(chunks_ids) + + maybe_trun_chunks = truncate_list_by_token_size( + chunks, + key=lambda x: x["content"], + max_token_size=query_param.max_token_for_text_unit, + ) + logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks") + section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks]) + if query_param.only_need_context: + return section + sys_prompt_temp = PROMPTS["naive_rag_response"] + sys_prompt = sys_prompt_temp.format( + content_data=section, response_type=query_param.response_type + ) + response = await use_model_func( + query, + system_prompt=sys_prompt, + ) + return response + diff --git a/lightrag/prompt.py b/lightrag/prompt.py new file mode 100644 index 000000000..5d28e49c5 --- /dev/null +++ b/lightrag/prompt.py @@ -0,0 +1,256 @@ +GRAPH_FIELD_SEP = "<SEP>" + +PROMPTS = {} + +PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" +PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" +PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" +PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] + +PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] + +PROMPTS[ + "entity_extraction" +] = """-Goal- +Given a text 
document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+
+-Steps-
+1. Identify all entities. For each identified entity, extract the following information:
+- entity_name: Name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
+
+3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+5. When finished, output {completion_delimiter}
+
+######################
+-Examples-
+######################
+Example 1:
+
+Entity_types: [person, technology, mission, organization, location]
+Text:
+while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
+
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
+
+The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
+
+It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} +("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} +("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} +("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} +("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} +############################# +Example 2: + +Entity_types: [person, technology, mission, organization, location] +Text: +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. + +Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. + +Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly +############# +Output: +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} +("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} +("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} +("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} +############################# +Example 3: + +Entity_types: [person, role, technology, organization, event, location, concept] +Text: +their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." + +Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." + +Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
+ +The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation +############# +Output: +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} +("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} +("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} +("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} +("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} +("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} +("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} +("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +PROMPTS[ + "summarize_entity_descriptions" +] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. +Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. 
+ +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +PROMPTS[ + "entiti_continue_extraction" +] = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +PROMPTS[ + "entiti_if_loop_extraction" +] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." + +PROMPTS[ + "rag_response" +] = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + + +---Data tables--- + +{context_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Do not include information where the supporting evidence for it is not provided. + + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +PROMPTS["keywords_extraction"] = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. + +---Goal--- + +Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. + +---Instructions--- + +- Output the keywords in JSON format. +- The JSON should have two keys: + - "high_level_keywords" for overarching concepts or themes. + - "low_level_keywords" for specific entities or details. + +###################### +-Examples- +###################### +Example 1: + +Query: "How does international trade influence global economic stability?" +################ +Output: +{{ + "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], + "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] +}} +############################# +Example 2: + +Query: "What are the environmental consequences of deforestation on biodiversity?" +################ +Output: +{{ + "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], + "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] +}} +############################# +Example 3: + +Query: "What is the role of education in reducing poverty?" 
+################ +Output: +{{ + "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], + "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] +}} +############################# +-Real Data- +###################### +Query: {query} +###################### +Output: + +""" + +PROMPTS[ + "naive_rag_response" +] = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" diff --git a/lightrag/storage.py b/lightrag/storage.py new file mode 100644 index 000000000..2f2bb7d8f --- /dev/null +++ b/lightrag/storage.py @@ -0,0 +1,246 @@ +import asyncio +import html +import json +import os +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Union, cast +import pickle +import hnswlib +import networkx as nx +import numpy as np +from nano_vectordb import NanoVectorDB +import xxhash + +from .utils import load_json, logger, write_json +from .base import ( + BaseGraphStorage, + BaseKVStorage, + BaseVectorStorage, +) + +@dataclass +class JsonKVStorage(BaseKVStorage): + def __post_init__(self): + working_dir = self.global_config["working_dir"] + self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") + self._data = load_json(self._file_name) or {} + logger.info(f"Load KV {self.namespace} with {len(self._data)} data") + + async def all_keys(self) -> list[str]: + return list(self._data.keys()) + + async def index_done_callback(self): + write_json(self._data, self._file_name) + + async def get_by_id(self, id): + return self._data.get(id, None) + + async def get_by_ids(self, ids, fields=None): + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + async def filter_keys(self, data: list[str]) -> set[str]: + return set([s for s in data if s not in self._data]) + + async def upsert(self, data: dict[str, dict]): + left_data = {k: v for k, v in data.items() if k not in self._data} + self._data.update(left_data) + return left_data + + async def drop(self): + self._data = {} + +@dataclass +class NanoVectorDBStorage(BaseVectorStorage): + cosine_better_than_threshold: float = 0.2 + + def __post_init__(self): + + self._client_file_name = os.path.join( + self.global_config["working_dir"], f"vdb_{self.namespace}.json" + ) + self._max_batch_size = self.global_config["embedding_batch_num"] + self._client = NanoVectorDB( + self.embedding_func.embedding_dim, storage_file=self._client_file_name + ) + self.cosine_better_than_threshold = self.global_config.get( + "cosine_better_than_threshold", self.cosine_better_than_threshold + ) + + async def upsert(self, data: dict[str, dict]): + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not len(data): + logger.warning("You insert an 
empty data to vector DB") + return [] + list_data = [ + { + "__id__": k, + **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, + } + for k, v in data.items() + ] + contents = [v["content"] for v in data.values()] + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + embeddings_list = await asyncio.gather( + *[self.embedding_func(batch) for batch in batches] + ) + embeddings = np.concatenate(embeddings_list) + for i, d in enumerate(list_data): + d["__vector__"] = embeddings[i] + results = self._client.upsert(datas=list_data) + return results + + async def query(self, query: str, top_k=5): + embedding = await self.embedding_func([query]) + embedding = embedding[0] + results = self._client.query( + query=embedding, + top_k=top_k, + better_than_threshold=self.cosine_better_than_threshold, + ) + results = [ + {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results + ] + return results + + async def index_done_callback(self): + self._client.save() + +@dataclass +class NetworkXStorage(BaseGraphStorage): + @staticmethod + def load_nx_graph(file_name) -> nx.Graph: + if os.path.exists(file_name): + return nx.read_graphml(file_name) + return None + + @staticmethod + def write_nx_graph(graph: nx.Graph, file_name): + logger.info( + f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges" + ) + nx.write_graphml(graph, file_name) + + @staticmethod + def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Return the largest connected component of the graph, with nodes and edges sorted in a stable way. + """ + from graspologic.utils import largest_connected_component + + graph = graph.copy() + graph = cast(nx.Graph, largest_connected_component(graph)) + node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + graph = nx.relabel_nodes(graph, node_mapping) + return NetworkXStorage._stabilize_graph(graph) + + @staticmethod + def _stabilize_graph(graph: nx.Graph) -> nx.Graph: + """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py + Ensure an undirected graph with the same relationships will always be read the same way. 
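+        (Nodes are added in sorted order and undirected edge endpoints are normalized
+        before the edges themselves are sorted, so the serialized GraphML is deterministic.)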
+ """ + fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph() + + sorted_nodes = graph.nodes(data=True) + sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0]) + + fixed_graph.add_nodes_from(sorted_nodes) + edges = list(graph.edges(data=True)) + + if not graph.is_directed(): + + def _sort_source_target(edge): + source, target, edge_data = edge + if source > target: + temp = source + source = target + target = temp + return source, target, edge_data + + edges = [_sort_source_target(edge) for edge in edges] + + def _get_edge_key(source: Any, target: Any) -> str: + return f"{source} -> {target}" + + edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1])) + + fixed_graph.add_edges_from(edges) + return fixed_graph + + def __post_init__(self): + self._graphml_xml_file = os.path.join( + self.global_config["working_dir"], f"graph_{self.namespace}.graphml" + ) + preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file) + if preloaded_graph is not None: + logger.info( + f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges" + ) + self._graph = preloaded_graph or nx.Graph() + self._node_embed_algorithms = { + "node2vec": self._node2vec_embed, + } + + async def index_done_callback(self): + NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file) + + async def has_node(self, node_id: str) -> bool: + return self._graph.has_node(node_id) + + async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: + return self._graph.has_edge(source_node_id, target_node_id) + + async def get_node(self, node_id: str) -> Union[dict, None]: + return self._graph.nodes.get(node_id) + + async def node_degree(self, node_id: str) -> int: + return self._graph.degree(node_id) + + async def edge_degree(self, src_id: str, tgt_id: str) -> int: + return self._graph.degree(src_id) + self._graph.degree(tgt_id) + + async def get_edge( + self, source_node_id: str, target_node_id: str + ) -> Union[dict, None]: + return self._graph.edges.get((source_node_id, target_node_id)) + + async def get_node_edges(self, source_node_id: str): + if self._graph.has_node(source_node_id): + return list(self._graph.edges(source_node_id)) + return None + + async def upsert_node(self, node_id: str, node_data: dict[str, str]): + self._graph.add_node(node_id, **node_data) + + async def upsert_edge( + self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] + ): + self._graph.add_edge(source_node_id, target_node_id, **edge_data) + + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: + if algorithm not in self._node_embed_algorithms: + raise ValueError(f"Node embedding algorithm {algorithm} not supported") + return await self._node_embed_algorithms[algorithm]() + + async def _node2vec_embed(self): + from graspologic import embed + + embeddings, nodes = embed.node2vec_embed( + self._graph, + **self.global_config["node2vec_params"], + ) + + nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] + return embeddings, nodes_ids diff --git a/lightrag/utils.py b/lightrag/utils.py new file mode 100644 index 000000000..c75b4270c --- /dev/null +++ b/lightrag/utils.py @@ -0,0 +1,165 @@ +import asyncio +import html +import json +import logging +import os +import re +from dataclasses import dataclass +from functools import wraps +from hashlib import md5 +from typing import Any, Union + +import numpy as np +import tiktoken + +ENCODER = None + +logger = 
logging.getLogger("lightrag") + +def set_logger(log_file: str): + logger.setLevel(logging.DEBUG) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + + if not logger.handlers: + logger.addHandler(file_handler) + +@dataclass +class EmbeddingFunc: + embedding_dim: int + max_token_size: int + func: callable + + async def __call__(self, *args, **kwargs) -> np.ndarray: + return await self.func(*args, **kwargs) + +def locate_json_string_body_from_string(content: str) -> Union[str, None]: + """Locate the JSON string body from a string""" + maybe_json_str = re.search(r"{.*}", content, re.DOTALL) + if maybe_json_str is not None: + return maybe_json_str.group(0) + else: + return None + +def convert_response_to_json(response: str) -> dict: + json_str = locate_json_string_body_from_string(response) + assert json_str is not None, f"Unable to parse JSON from response: {response}" + try: + data = json.loads(json_str) + return data + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON: {json_str}") + raise e from None + +def compute_args_hash(*args): + return md5(str(args).encode()).hexdigest() + +def compute_mdhash_id(content, prefix: str = ""): + return prefix + md5(content.encode()).hexdigest() + +def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): + """Add restriction of maximum async calling times for a async func""" + + def final_decro(func): + """Not using async.Semaphore to aovid use nest-asyncio""" + __current_size = 0 + + @wraps(func) + async def wait_func(*args, **kwargs): + nonlocal __current_size + while __current_size >= max_size: + await asyncio.sleep(waitting_time) + __current_size += 1 + result = await func(*args, **kwargs) + __current_size -= 1 + return result + + return wait_func + + return final_decro + +def wrap_embedding_func_with_attrs(**kwargs): + """Wrap a function with attributes""" + + def final_decro(func) -> EmbeddingFunc: + new_func = EmbeddingFunc(**kwargs, func=func) + return new_func + + return final_decro + +def load_json(file_name): + if not os.path.exists(file_name): + return None + with open(file_name) as f: + return json.load(f) + +def write_json(json_obj, file_name): + with open(file_name, "w") as f: + json.dump(json_obj, f, indent=2, ensure_ascii=False) + +def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + tokens = ENCODER.encode(content) + return tokens + + +def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): + global ENCODER + if ENCODER is None: + ENCODER = tiktoken.encoding_for_model(model_name) + content = ENCODER.decode(tokens) + return content + +def pack_user_ass_to_openai_messages(*args: str): + roles = ["user", "assistant"] + return [ + {"role": roles[i % 2], "content": content} for i, content in enumerate(args) + ] + +def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: + """Split a string by multiple markers""" + if not markers: + return [content] + results = re.split("|".join(re.escape(marker) for marker in markers), content) + return [r.strip() for r in results if r.strip()] + +# Refer the utils functions of the official GraphRAG implementation: +# https://github.com/microsoft/graphrag +def clean_str(input: Any) -> str: + """Clean an input string by removing HTML escapes, control 
characters, and other unwanted characters.""" + # If we get non-string input, just give it back + if not isinstance(input, str): + return input + + result = html.unescape(input.strip()) + # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python + return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + +def is_float_regex(value): + return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + +def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): + """Truncate a list of data by token size""" + if max_token_size <= 0: + return [] + tokens = 0 + for i, data in enumerate(list_data): + tokens += len(encode_string_by_tiktoken(key(data))) + if tokens > max_token_size: + return list_data[:i] + return list_data + +def list_of_list_to_csv(data: list[list]): + return "\n".join( + [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] + ) + +def save_data_to_file(data, file_name): + with open(file_name, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file From 2425ede64fff575c89c58b85bd0ca1b695320fd3 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 10 Oct 2024 15:17:03 +0800 Subject: [PATCH 11/67] update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 42de1c1cb..60b655abf 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Output your evaluation in the following JSON format: ``` ## Citation -``` +```python @article{guo2024lightrag, title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, From bf84cf18a11006cb1767c550d84f4b393dfb32a8 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 11:22:17 +0800 Subject: [PATCH 12/67] update utils.py --- lightrag/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index c75b4270c..9496cf34b 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -94,11 +94,11 @@ def final_decro(func) -> EmbeddingFunc: def load_json(file_name): if not os.path.exists(file_name): return None - with open(file_name) as f: + with open(file_name, encoding="utf-8") as f: return json.load(f) def write_json(json_obj, file_name): - with open(file_name, "w") as f: + with open(file_name, "w", encoding="utf-8") as f: json.dump(json_obj, f, indent=2, ensure_ascii=False) def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): From 2d2085e6f92ac636a58e014c56df6f65319fd313 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 11:24:42 +0800 Subject: [PATCH 13/67] update utils.py --- lightrag/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index c75b4270c..9496cf34b 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -94,11 +94,11 @@ def final_decro(func) -> EmbeddingFunc: def load_json(file_name): if not os.path.exists(file_name): return None - with open(file_name) as f: + with open(file_name, encoding="utf-8") as f: return json.load(f) def write_json(json_obj, file_name): - with open(file_name, "w") as f: + with open(file_name, "w", encoding="utf-8") as f: json.dump(json_obj, f, indent=2, ensure_ascii=False) def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): From 2e364fb665bd2dacfa355c29b1a584deaa0bbe47 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: 
Fri, 11 Oct 2024 11:39:31 +0800 Subject: [PATCH 14/67] update __init__.py --- lightrag/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index dc497cd44..0b279096a 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.2" +__version__ = "0.0.3" __author__ = "Zirui Guo" -__url__ = "https://github.com/HKUDS/GraphEdit" +__url__ = "https://github.com/HKUDS/LightRAG" From 050ff77dafb78c5ad8d68945926cc0bd2407308c Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 11:43:54 +0800 Subject: [PATCH 15/67] update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 60b655abf..9c588804f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ + + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) From 4011aaf8be69fbf7a7d8eb90708d8cfba2d71682 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 11:46:57 +0800 Subject: [PATCH 16/67] update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c588804f..ee5535e67 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # LightRAG: Simple and Fast Retrieval-Augmented Generation -![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) +
+![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) @@ -10,6 +11,8 @@ This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) +
+ ## Install * Install from source From ea8019d1b89a95b1806edfc60beb0aeb62ed2bde Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:47:53 +0800 Subject: [PATCH 17/67] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee5535e67..1fada2611 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # LightRAG: Simple and Fast Retrieval-Augmented Generation -
-![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) +![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) +
From 7df403557951de2dd1943cf359e3797d6a6b1f98 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:59:51 +0800 Subject: [PATCH 18/67] Update README.md --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1fada2611..693f60cf2 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,18 @@ ![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg)
- - - - - +

+ + + + +

+

+ + + +

+ This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png)
From 6713017df9e070414449a01d055b92587c2a43df Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 15:16:43 +0800 Subject: [PATCH 19/67] update reproduce --- README.md | 114 +++++++++++++++++++++++++++++++++++++++++++- reproduce/Step_0.py | 63 ++++++++++++++++++++++++ reproduce/Step_1.py | 32 +++++++++++++ reproduce/Step_2.py | 76 +++++++++++++++++++++++++++++ reproduce/Step_3.py | 62 ++++++++++++++++++++++++ 5 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 reproduce/Step_0.py create mode 100644 reproduce/Step_1.py create mode 100644 reproduce/Step_2.py create mode 100644 reproduce/Step_3.py diff --git a/README.md b/README.md index 693f60cf2..f70b9d58a 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,6 @@ Output your evaluation in the following JSON format: }} ``` ### Overall Performance Table -### Overall Performance Table | | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | |----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| | | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | @@ -173,6 +172,114 @@ Output your evaluation in the following JSON format: | **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | | **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | +## Reproduce +All the code can be found in the `./reproduce` directory. +### Step-0 Extract Unique Contexts +First, we need to extract unique contexts in the datasets. +```python +def extract_unique_contexts(input_directory, output_directory): + + os.makedirs(output_directory, exist_ok=True) + + jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl')) + print(f"Found {len(jsonl_files)} JSONL files.") + + for file_path in jsonl_files: + filename = os.path.basename(file_path) + name, ext = os.path.splitext(filename) + output_filename = f"{name}_unique_contexts.json" + output_path = os.path.join(output_directory, output_filename) + + unique_contexts_dict = {} + + print(f"Processing file: {filename}") + + try: + with open(file_path, 'r', encoding='utf-8') as infile: + for line_number, line in enumerate(infile, start=1): + line = line.strip() + if not line: + continue + try: + json_obj = json.loads(line) + context = json_obj.get('context') + if context and context not in unique_contexts_dict: + unique_contexts_dict[context] = None + except json.JSONDecodeError as e: + print(f"JSON decoding error in file {filename} at line {line_number}: {e}") + except FileNotFoundError: + print(f"File not found: {filename}") + continue + except Exception as e: + print(f"An error occurred while processing file {filename}: {e}") + continue + + unique_contexts_list = list(unique_contexts_dict.keys()) + print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.") + + try: + with open(output_path, 'w', encoding='utf-8') as outfile: + json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4) + print(f"Unique `context` entries have been saved to: {output_filename}") + except Exception as e: + print(f"An error occurred while saving to the file {output_filename}: {e}") + + print("All files have been processed.") + +``` +### Step-1 Insert Contexts +For the extracted contexts, we insert them into the LightRAG system. 
+ +```python +def insert_text(rag, file_path): + with open(file_path, mode='r') as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") +``` +### Step-2 Generate Queries + +We extract tokens from both the first half and the second half of each context in the dataset, then combine them to generate queries for dataset descriptions. +```python +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + +def get_summary(context, tot_tokens=2000): + tokens = tokenizer.tokenize(context) + half_tokens = tot_tokens // 2 + + start_tokens = tokens[1000:1000 + half_tokens] + end_tokens = tokens[-(1000 + half_tokens):1000] + + summary_tokens = start_tokens + end_tokens + summary = tokenizer.convert_tokens_to_string(summary_tokens) + + return summary +``` + +### Step-3 Query +For the queries generated in Step-2, we will extract them and query LightRAG. +```python +def extract_queries(file_path): + with open(file_path, 'r') as f: + data = f.read() + + data = data.replace('**', '') + + queries = re.findall(r'- Question \d+: (.+)', data) + + return queries +``` ## Code Structure ```python @@ -191,6 +298,11 @@ Output your evaluation in the following JSON format: │ ├── prompt.py │ ├── storage.py │ └── utils.jpeg +├── reproduce +│ ├── Step_0.py +│ ├── Step_1.py +│ ├── Step_2.py +│ └── Step_3.py ├── LICENSE ├── README.md ├── requirements.txt diff --git a/reproduce/Step_0.py b/reproduce/Step_0.py new file mode 100644 index 000000000..9053aa40e --- /dev/null +++ b/reproduce/Step_0.py @@ -0,0 +1,63 @@ +import os +import json +import glob +import argparse + +def extract_unique_contexts(input_directory, output_directory): + + os.makedirs(output_directory, exist_ok=True) + + jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl')) + print(f"Found {len(jsonl_files)} JSONL files.") + + for file_path in jsonl_files: + filename = os.path.basename(file_path) + name, ext = os.path.splitext(filename) + output_filename = f"{name}_unique_contexts.json" + output_path = os.path.join(output_directory, output_filename) + + unique_contexts_dict = {} + + print(f"Processing file: {filename}") + + try: + with open(file_path, 'r', encoding='utf-8') as infile: + for line_number, line in enumerate(infile, start=1): + line = line.strip() + if not line: + continue + try: + json_obj = json.loads(line) + context = json_obj.get('context') + if context and context not in unique_contexts_dict: + unique_contexts_dict[context] = None + except json.JSONDecodeError as e: + print(f"JSON decoding error in file {filename} at line {line_number}: {e}") + except FileNotFoundError: + print(f"File not found: {filename}") + continue + except Exception as e: + print(f"An error occurred while processing file {filename}: {e}") + continue + + unique_contexts_list = list(unique_contexts_dict.keys()) + print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.") + + try: + with open(output_path, 'w', encoding='utf-8') as outfile: + json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4) + print(f"Unique `context` entries have been saved to: {output_filename}") + except Exception as e: + print(f"An error occurred while saving to the file {output_filename}: {e}") + + print("All files 
have been processed.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input_dir', type=str, default='../datasets') + parser.add_argument('-o', '--output_dir', type=str, default='../datasets/unique_contexts') + + args = parser.parse_args() + + extract_unique_contexts(args.input_dir, args.output_dir) diff --git a/reproduce/Step_1.py b/reproduce/Step_1.py new file mode 100644 index 000000000..08e497cbd --- /dev/null +++ b/reproduce/Step_1.py @@ -0,0 +1,32 @@ +import os +import json +import time + +from lightrag import LightRAG + +def insert_text(rag, file_path): + with open(file_path, mode='r') as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") + +cls = "agriculture" +WORKING_DIR = "../{cls}" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG(working_dir=WORKING_DIR) + +insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") \ No newline at end of file diff --git a/reproduce/Step_2.py b/reproduce/Step_2.py new file mode 100644 index 000000000..b00c19b8e --- /dev/null +++ b/reproduce/Step_2.py @@ -0,0 +1,76 @@ +import os +import json +from openai import OpenAI +from transformers import GPT2Tokenizer + +def openai_complete_if_cache( + model="gpt-4o", prompt=None, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_client = OpenAI() + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + response = openai_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + return response.choices[0].message.content + +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + +def get_summary(context, tot_tokens=2000): + tokens = tokenizer.tokenize(context) + half_tokens = tot_tokens // 2 + + start_tokens = tokens[1000:1000 + half_tokens] + end_tokens = tokens[-(1000 + half_tokens):1000] + + summary_tokens = start_tokens + end_tokens + summary = tokenizer.convert_tokens_to_string(summary_tokens) + + return summary + + +clses = ['agriculture'] +for cls in clses: + with open(f'../datasets/unique_contexts/{cls}_unique_contexts.json', mode='r') as f: + unique_contexts = json.load(f) + + summaries = [get_summary(context) for context in unique_contexts] + + total_description = "\n\n".join(summaries) + + prompt = f""" + Given the following description of a dataset: + + {total_description} + + Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset. + + Output the results in the following structure: + - User 1: [user description] + - Task 1: [task description] + - Question 1: + - Question 2: + - Question 3: + - Question 4: + - Question 5: + - Task 2: [task description] + ... + - Task 5: [task description] + - User 2: [user description] + ... + - User 5: [user description] + ... 
+ """ + + result = openai_complete_if_cache(model='gpt-4o', prompt=prompt) + + file_path = f"../datasets/questions/{cls}_questions.txt" + with open(file_path, "w") as file: + file.write(result) + + print(f"{cls}_questions written to {file_path}") \ No newline at end of file diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py new file mode 100644 index 000000000..f7f7ee300 --- /dev/null +++ b/reproduce/Step_3.py @@ -0,0 +1,62 @@ +import re +import json +import asyncio +from lightrag import LightRAG, QueryParam +from tqdm import tqdm + +def extract_queries(file_path): + with open(file_path, 'r') as f: + data = f.read() + + data = data.replace('**', '') + + queries = re.findall(r'- Question \d+: (.+)', data) + + return queries + +async def process_query(query_text, rag_instance, query_param): + try: + result, context = await rag_instance.aquery(query_text, param=query_param) + return {"query": query_text, "result": result, "context": context}, None + except Exception as e: + return None, {"query": query_text, "error": str(e)} + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + +def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file, error_file): + loop = always_get_an_event_loop() + + with open(output_file, 'a', encoding='utf-8') as result_file, open(error_file, 'a', encoding='utf-8') as err_file: + result_file.write("[\n") + first_entry = True + + for query_text in tqdm(queries, desc="Processing queries", unit="query"): + result, error = loop.run_until_complete(process_query(query_text, rag_instance, query_param)) + + if result: + if not first_entry: + result_file.write(",\n") + json.dump(result, result_file, ensure_ascii=False, indent=4) + first_entry = False + elif error: + json.dump(error, err_file, ensure_ascii=False, indent=4) + err_file.write("\n") + + result_file.write("\n]") + +if __name__ == "__main__": + cls = "agriculture" + mode = "hybird" + WORKING_DIR = "../{cls}" + + rag = LightRAG(working_dir=WORKING_DIR) + query_param = QueryParam(mode=mode) + + queries = extract_queries(f"../datasets/questions/{cls}_questions.txt") + run_queries_and_save_to_json(queries, rag, query_param, "result.json", "errors.json") \ No newline at end of file From de9ff79a29ecc0d4644a967621c50f046cec2b15 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Fri, 11 Oct 2024 15:19:20 +0800 Subject: [PATCH 20/67] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f70b9d58a..6d837c244 100644 --- a/README.md +++ b/README.md @@ -250,7 +250,7 @@ def insert_text(rag, file_path): ``` ### Step-2 Generate Queries -We extract tokens from both the first half and the second half of each context in the dataset, then combine them to generate queries for dataset descriptions. +We extract tokens from both the first half and the second half of each context in the dataset, then combine them as the dataset description to generate queries. 
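
As a rough sketch of how these per-context summaries are combined into the dataset description (mirroring `reproduce/Step_2.py` shown earlier; the `summarize` helper here is a simplified stand-in for the `get_summary` defined just below, and the paths and dataset name are placeholders):

```python
import json

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def summarize(context: str, tot_tokens: int = 2000) -> str:
    # Simplified stand-in for get_summary: keep half of the token budget from
    # the start of the context and half from the end (assumes the context is
    # longer than tot_tokens), then turn the kept tokens back into text.
    tokens = tokenizer.tokenize(context)
    half = tot_tokens // 2
    return tokenizer.convert_tokens_to_string(tokens[:half] + tokens[-half:])

cls = "agriculture"  # placeholder dataset class
with open(f"../datasets/unique_contexts/{cls}_unique_contexts.json", encoding="utf-8") as f:
    unique_contexts = json.load(f)

summaries = [summarize(context) for context in unique_contexts]
# The joined summaries form the dataset description fed to the question-generation prompt.
total_description = "\n\n".join(summaries)
```
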
```python tokenizer = GPT2Tokenizer.from_pretrained('gpt2') From b92e8c9df4d169f6b081c724ec164a412e1ad43f Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:44:00 +0800 Subject: [PATCH 21/67] Update README.md From 5b29ebdebcda92910187eed415dbfe28ee9e6102 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:46:51 +0800 Subject: [PATCH 22/67] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d837c244..ee70cffba 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@

- - + +

This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). From 837dcf53e605b018a4e752a6899f5b60cd3940b2 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:49:16 +0800 Subject: [PATCH 23/67] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ee70cffba..da5f7ebfe 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,10 @@

-

+

- - + +

This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). From a115b4b8ce1e18427432e80bf0f45762397ee1ac Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sat, 12 Oct 2024 18:13:33 +0800 Subject: [PATCH 24/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index da5f7ebfe..5d8734e82 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@

- + From 622ffd9e829c7f7157f4cf1995604d7b02e808ef Mon Sep 17 00:00:00 2001 From: HeAndres <69391549+HeAndres@users.noreply.github.com> Date: Sun, 13 Oct 2024 01:31:37 +0200 Subject: [PATCH 25/67] Fix typo on readme: utils.jpeg -> utils.py --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d8734e82..1541e1531 100644 --- a/README.md +++ b/README.md @@ -297,7 +297,7 @@ def extract_queries(file_path): │ ├── operate.py │ ├── prompt.py │ ├── storage.py -│ └── utils.jpeg +│ └── utils.py ├── reproduce │ ├── Step_0.py │ ├── Step_1.py From c89fdc4d631dcfbe1b8b3912e2f69520de8d4399 Mon Sep 17 00:00:00 2001 From: TianyuFan0504 Date: Mon, 14 Oct 2024 19:41:07 +0800 Subject: [PATCH 26/67] Add HF Support --- lightrag/lightrag.py | 7 ++-- lightrag/llm.py | 84 ++++++++++++++++++++++++++++++++++++++++++-- lightrag/operate.py | 76 +++++++++++++++++++++++++++++++-------- 3 files changed, 148 insertions(+), 19 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 836fda9ec..25199888f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -5,7 +5,7 @@ from functools import partial from typing import Type, cast -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding +from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding,hf_model,hf_embedding from .operate import ( chunking_by_token_size, extract_entities, @@ -77,12 +77,13 @@ class LightRAG: ) # text embedding - embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) + embedding_func: EmbeddingFunc = field(default_factory=lambda: hf_embedding)#openai_embedding embedding_batch_num: int = 32 embedding_func_max_async: int = 16 # LLM - llm_model_func: callable = gpt_4o_mini_complete + llm_model_func: callable = hf_model#gpt_4o_mini_complete + llm_model_name: str = 'meta-llama/Llama-3.2-1B-Instruct'#'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = 32768 llm_model_max_async: int = 16 diff --git a/lightrag/llm.py b/lightrag/llm.py index ee700a104..ac1471c1e 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -7,10 +7,12 @@ wait_exponential, retry_if_exception_type, ) - +from transformers import AutoModel,AutoTokenizer, AutoModelForCausalLM +import torch from .base import BaseKVStorage from .utils import compute_args_hash, wrap_embedding_func_with_attrs - +import copy +os.environ["TOKENIZERS_PARALLELISM"] = "false" @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), @@ -42,6 +44,52 @@ async def openai_complete_if_cache( ) return response.choices[0].message.content +async def hf_model_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + model_name = model + hf_tokenizer = AutoTokenizer.from_pretrained(model_name,device_map = 'auto') + if hf_tokenizer.pad_token == None: + # print("use eos token") + hf_tokenizer.pad_token = hf_tokenizer.eos_token + hf_model = AutoModelForCausalLM.from_pretrained(model_name,device_map = 'auto') + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + input_prompt = '' + try: + input_prompt = 
hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + except: + try: + ori_message = copy.deepcopy(messages) + if messages[0]['role'] == "system": + messages[1]['content'] = "" + messages[0]['content'] + "\n" + messages[1]['content'] + messages = messages[1:] + input_prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + except: + len_message = len(ori_message) + for msgid in range(len_message): + input_prompt =input_prompt+ '<'+ori_message[msgid]['role']+'>'+ori_message[msgid]['content']+'\n' + + input_ids = hf_tokenizer(input_prompt, return_tensors='pt', padding=True, truncation=True).to("cuda") + output = hf_model.generate(**input_ids, max_new_tokens=200, num_return_sequences=1,early_stopping = True) + response_text = hf_tokenizer.decode(output[0], skip_special_tokens=True) + if hashing_kv is not None: + await hashing_kv.upsert( + {args_hash: {"return": response_text, "model": model}} + ) + return response_text + + async def gpt_4o_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -65,6 +113,20 @@ async def gpt_4o_mini_complete( **kwargs, ) + + +async def hf_model( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + input_string = kwargs.get('model_name', 'google/gemma-2-2b-it') + return await hf_model_if_cache( + input_string, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + @wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( stop=stop_after_attempt(3), @@ -78,6 +140,24 @@ async def openai_embedding(texts: list[str]) -> np.ndarray: ) return np.array([dp.embedding for dp in response.data]) + + +global EMBED_MODEL +global tokenizer +EMBED_MODEL = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +@wrap_embedding_func_with_attrs( + embedding_dim=384, + max_token_size=5000, +) +async def hf_embedding(texts: list[str]) -> np.ndarray: + input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids + with torch.no_grad(): + outputs = EMBED_MODEL(input_ids) + embeddings = outputs.last_hidden_state.mean(dim=1) + return embeddings.detach().numpy() + + if __name__ == "__main__": import asyncio diff --git a/lightrag/operate.py b/lightrag/operate.py index 2d3271da8..21b914f9b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3,7 +3,7 @@ import re from typing import Union from collections import Counter, defaultdict - +import warnings from .utils import ( logger, clean_str, @@ -398,10 +398,15 @@ async def local_query( keywords = keywords_data.get("low_level_keywords", []) keywords = ', '.join(keywords) except json.JSONDecodeError as e: + try: + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + keywords_data = json.loads(result) + keywords = keywords_data.get("low_level_keywords", []) + keywords = ', '.join(keywords) # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - + except json.JSONDecodeError as e: + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] context = await _build_local_query_context( keywords, knowledge_graph_inst, @@ -421,6 +426,9 @@ async def local_query( query, system_prompt=sys_prompt, ) + if len(response)>len(sys_prompt): + response = 
response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() + return response async def _build_local_query_context( @@ -617,9 +625,16 @@ async def global_query( keywords = keywords_data.get("high_level_keywords", []) keywords = ', '.join(keywords) except json.JSONDecodeError as e: - # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] + try: + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + keywords_data = json.loads(result) + keywords = keywords_data.get("high_level_keywords", []) + keywords = ', '.join(keywords) + + except json.JSONDecodeError as e: + # Handle parsing error + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] context = await _build_global_query_context( keywords, @@ -643,6 +658,9 @@ async def global_query( query, system_prompt=sys_prompt, ) + if len(response)>len(sys_prompt): + response = response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() + return response async def _build_global_query_context( @@ -822,8 +840,8 @@ async def hybird_query( kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) + result = await use_model_func(kw_prompt) - try: keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) @@ -831,10 +849,18 @@ async def hybird_query( hl_keywords = ', '.join(hl_keywords) ll_keywords = ', '.join(ll_keywords) except json.JSONDecodeError as e: + try: + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + keywords_data = json.loads(result) + hl_keywords = keywords_data.get("high_level_keywords", []) + ll_keywords = keywords_data.get("low_level_keywords", []) + hl_keywords = ', '.join(hl_keywords) + ll_keywords = ', '.join(ll_keywords) # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - + except json.JSONDecodeError as e: + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + low_level_context = await _build_local_query_context( ll_keywords, knowledge_graph_inst, @@ -851,7 +877,7 @@ async def hybird_query( text_chunks_db, query_param, ) - + context = combine_contexts(high_level_context, low_level_context) if query_param.only_need_context: @@ -867,10 +893,13 @@ async def hybird_query( query, system_prompt=sys_prompt, ) + if len(response)>len(sys_prompt): + response = response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() return response def combine_contexts(high_level_context, low_level_context): # Function to extract entities, relationships, and sources from context strings + def extract_sections(context): entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) @@ -883,8 +912,21 @@ def extract_sections(context): return entities, relationships, sources # Extract sections from both contexts - hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) - ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) + + if high_level_context==None: + warnings.warn("High Level context is None. 
Return empty High entity/relationship/source") + hl_entities, hl_relationships, hl_sources = '','','' + else: + hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) + + + if low_level_context==None: + warnings.warn("Low Level context is None. Return empty Low entity/relationship/source") + ll_entities, ll_relationships, ll_sources = '','','' + else: + ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) + + # Combine and deduplicate the entities combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n'))) @@ -917,6 +959,7 @@ async def naive_query( global_config: dict, ): use_model_func = global_config["llm_model_func"] + use_model_name = global_config['llm_model_name'] results = await chunks_vdb.query(query, top_k=query_param.top_k) if not len(results): return PROMPTS["fail_response"] @@ -939,6 +982,11 @@ async def naive_query( response = await use_model_func( query, system_prompt=sys_prompt, + model_name = use_model_name ) + + if len(response)>len(sys_prompt): + response = response[len(sys_prompt):].replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() + return response From d47dd465cee957d6d8a90f5b5361acaf928eeb88 Mon Sep 17 00:00:00 2001 From: TianyuFan0504 Date: Mon, 14 Oct 2024 20:33:46 +0800 Subject: [PATCH 27/67] update hf_model_complete --- lightrag/lightrag.py | 7 ++++--- lightrag/llm.py | 4 ++-- lightrag/operate.py | 2 -- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 25199888f..9c34a6070 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -5,7 +5,7 @@ from functools import partial from typing import Type, cast -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding,hf_model,hf_embedding +from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding,hf_model_complete,hf_embedding from .operate import ( chunking_by_token_size, extract_entities, @@ -77,12 +77,13 @@ class LightRAG: ) # text embedding - embedding_func: EmbeddingFunc = field(default_factory=lambda: hf_embedding)#openai_embedding + # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) + embedding_func: EmbeddingFunc = field(default_factory=lambda:openai_embedding)# embedding_batch_num: int = 32 embedding_func_max_async: int = 16 # LLM - llm_model_func: callable = hf_model#gpt_4o_mini_complete + llm_model_func: callable = gpt_4o_mini_complete#hf_model_complete# llm_model_name: str = 'meta-llama/Llama-3.2-1B-Instruct'#'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = 32768 llm_model_max_async: int = 16 diff --git a/lightrag/llm.py b/lightrag/llm.py index ac1471c1e..5fb27b045 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -115,10 +115,10 @@ async def gpt_4o_mini_complete( -async def hf_model( +async def hf_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - input_string = kwargs.get('model_name', 'google/gemma-2-2b-it') + input_string = kwargs['hashing_kv'].global_config['llm_model_name'] return await hf_model_if_cache( input_string, prompt, diff --git a/lightrag/operate.py b/lightrag/operate.py index 21b914f9b..a8213a379 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -959,7 +959,6 @@ async def naive_query( global_config: dict, ): use_model_func = global_config["llm_model_func"] - use_model_name = global_config['llm_model_name'] results = await 
chunks_vdb.query(query, top_k=query_param.top_k) if not len(results): return PROMPTS["fail_response"] @@ -982,7 +981,6 @@ async def naive_query( response = await use_model_func( query, system_prompt=sys_prompt, - model_name = use_model_name ) if len(response)>len(sys_prompt): From ea126a7108135a6ecd7a3d52aa5f5620bd92dfaa Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 19:40:08 +0800 Subject: [PATCH 28/67] Add huggingface model support --- README.md | 8 +++---- examples/insert.py | 18 ---------------- examples/lightrag_hf_demo.py | 36 ++++++++++++++++++++++++++++++++ examples/lightrag_openai_demo.py | 33 +++++++++++++++++++++++++++++ examples/query.py | 16 -------------- lightrag/__init__.py | 2 +- lightrag/base.py | 2 +- lightrag/lightrag.py | 29 ++++++++++++++++++------- lightrag/llm.py | 8 ++----- lightrag/operate.py | 2 +- reproduce/Step_3.py | 2 +- 11 files changed, 100 insertions(+), 56 deletions(-) delete mode 100644 examples/insert.py create mode 100644 examples/lightrag_hf_demo.py create mode 100644 examples/lightrag_openai_demo.py delete mode 100644 examples/query.py diff --git a/README.md b/README.md index 1541e1531..562585e46 100644 --- a/README.md +++ b/README.md @@ -59,8 +59,8 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode= # Perform global search print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) -# Perform hybird search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybird"))) +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) ``` Batch Insert ```python @@ -287,8 +287,8 @@ def extract_queries(file_path): ├── examples │ ├── batch_eval.py │ ├── generate_query.py -│ ├── insert.py -│ └── query.py +│ ├── lightrag_openai_demo.py +│ └── lightrag_hf_demo.py ├── lightrag │ ├── __init__.py │ ├── base.py diff --git a/examples/insert.py b/examples/insert.py deleted file mode 100644 index 25c3cddac..000000000 --- a/examples/insert.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -import sys - -from lightrag import LightRAG - -# os.environ["OPENAI_API_KEY"] = "" - -WORKING_DIR = "" - -if not os.path.exists(WORKING_DIR): - os.mkdir(WORKING_DIR) - -rag = LightRAG(working_dir=WORKING_DIR) - -with open('./text.txt', 'r') as f: - text = f.read() - -rag.insert(text) \ No newline at end of file diff --git a/examples/lightrag_hf_demo.py b/examples/lightrag_hf_demo.py new file mode 100644 index 000000000..f0e5fa99c --- /dev/null +++ b/examples/lightrag_hf_demo.py @@ -0,0 +1,36 @@ +import os +import sys + +from lightrag import LightRAG, QueryParam +from lightrag.llm import hf_model_complete, hf_embedding +from transformers import AutoModel,AutoTokenizer + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=hf_model_complete, + llm_model_name='meta-llama/Llama-3.1-8B-Instruct', + embedding_func=hf_embedding, + tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search 
+print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) diff --git a/examples/lightrag_openai_demo.py b/examples/lightrag_openai_demo.py new file mode 100644 index 000000000..677506c2f --- /dev/null +++ b/examples/lightrag_openai_demo.py @@ -0,0 +1,33 @@ +import os +import sys + +from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete +from transformers import AutoModel,AutoTokenizer + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_complete + # llm_model_func=gpt_4o_mini_complete +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) diff --git a/examples/query.py b/examples/query.py deleted file mode 100644 index 00c902ebb..000000000 --- a/examples/query.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import sys - -from lightrag import LightRAG, QueryParam - -# os.environ["OPENAI_API_KEY"] = "" - -WORKING_DIR = "" - -rag = LightRAG(working_dir=WORKING_DIR) - -mode = 'global' -query_param = QueryParam(mode=mode) - -result = rag.query("", param=query_param) -print(result) \ No newline at end of file diff --git a/lightrag/__init__.py b/lightrag/__init__.py index 0b279096a..b3d1d4ca8 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.3" +__version__ = "0.0.4" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/base.py b/lightrag/base.py index 9c0422feb..d677c406d 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -14,7 +14,7 @@ @dataclass class QueryParam: - mode: Literal["local", "global", "hybird", "naive"] = "global" + mode: Literal["local", "global", "hybrid", "naive"] = "global" only_need_context: bool = False response_type: str = "Multiple Paragraphs" top_k: int = 60 diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 9c34a6070..329bfd12d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -3,7 +3,8 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast +from typing import Type, cast, Any +from transformers import AutoModel,AutoTokenizer, AutoModelForCausalLM from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding,hf_model_complete,hf_embedding from .operate import ( @@ -11,7 +12,7 @@ extract_entities, local_query, global_query, - hybird_query, + hybrid_query, naive_query, ) @@ -38,15 +39,14 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: try: - # If there is already an event loop, use it. - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() except RuntimeError: - # If in a sub-thread, create a new event loop. 
logger.info("Creating a new event loop in a sub-thread.") loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) return loop + @dataclass class LightRAG: working_dir: str = field( @@ -77,6 +77,9 @@ class LightRAG: ) # text embedding + tokenizer: Any = None + embed_model: Any = None + # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) embedding_func: EmbeddingFunc = field(default_factory=lambda:openai_embedding)# embedding_batch_num: int = 32 @@ -100,6 +103,13 @@ class LightRAG: convert_response_to_json_func: callable = convert_response_to_json def __post_init__(self): + if callable(self.embedding_func) and self.embedding_func.__name__ == 'hf_embedding': + if self.tokenizer is None: + self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + if self.embed_model is None: + self.embed_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + + log_file = os.path.join(self.working_dir, "lightrag.log") set_logger(log_file) logger.info(f"Logger initialized for working directory: {self.working_dir}") @@ -130,8 +140,11 @@ def __post_init__(self): namespace="chunk_entity_relation", global_config=asdict(self) ) self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( - self.embedding_func + lambda texts: self.embedding_func(texts, self.tokenizer, self.embed_model) + if callable(self.embedding_func) and self.embedding_func.__name__ == 'hf_embedding' + else self.embedding_func(texts) ) + self.entities_vdb = ( self.vector_db_storage_cls( namespace="entities", @@ -267,8 +280,8 @@ async def aquery(self, query: str, param: QueryParam = QueryParam()): param, asdict(self), ) - elif param.mode == "hybird": - response = await hybird_query( + elif param.mode == "hybrid": + response = await hybrid_query( query, self.chunk_entity_relation_graph, self.entities_vdb, diff --git a/lightrag/llm.py b/lightrag/llm.py index 5fb27b045..bc2ac1f39 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -142,18 +142,14 @@ async def openai_embedding(texts: list[str]) -> np.ndarray: -global EMBED_MODEL -global tokenizer -EMBED_MODEL = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") -tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") @wrap_embedding_func_with_attrs( embedding_dim=384, max_token_size=5000, ) -async def hf_embedding(texts: list[str]) -> np.ndarray: +async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids with torch.no_grad(): - outputs = EMBED_MODEL(input_ids) + outputs = embed_model(input_ids) embeddings = outputs.last_hidden_state.mean(dim=1) return embeddings.detach().numpy() diff --git a/lightrag/operate.py b/lightrag/operate.py index a8213a379..3d388cb61 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -827,7 +827,7 @@ async def _find_related_text_unit_from_relationships( return all_text_units -async def hybird_query( +async def hybrid_query( query, knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py index f7f7ee300..e97e2af62 100644 --- a/reproduce/Step_3.py +++ b/reproduce/Step_3.py @@ -52,7 +52,7 @@ def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file if __name__ == "__main__": cls = "agriculture" - mode = "hybird" + mode = "hybrid" WORKING_DIR = "../{cls}" rag = LightRAG(working_dir=WORKING_DIR) From 
70baa4b572c71ec309eee88ce41d36bc189a04e8 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 19:55:30 +0800 Subject: [PATCH 29/67] update README.md --- README.md | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 562585e46..cc282c278 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ This repository hosts the code of LightRAG. The structure of this code is based ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png)

+## 🎉 News +- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports Hugging Face models! + ## Install * Install from source @@ -35,17 +38,27 @@ pip install lightrag-hku ## Quick Start -* Set OpenAI API key in environment: `export OPENAI_API_KEY="sk-...".` -* Download the demo text "A Christmas Carol by Charles Dickens" +* Set OpenAI API key in environment if using OpenAI models: `export OPENAI_API_KEY="sk-...".` +* Download the demo text "A Christmas Carol by Charles Dickens": ```bash curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt ``` -Use the below python snippet: +Use the below Python snippet to initialize LightRAG and perform queries: ```python from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete -rag = LightRAG(working_dir="./dickens") +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete # Use gpt_4o_mini_complete LLM model + # llm_model_func=gpt_4o_complete # Optionally, use a stronger model +) with open("./book.txt") as f: rag.insert(f.read()) @@ -62,13 +75,31 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode= # Perform hybrid search print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) ``` -Batch Insert +### Using Hugging Face Models +If you want to use Hugging Face models, you only need to set LightRAG as follows: +```python +from lightrag.llm import hf_model_complete, hf_embedding +from transformers import AutoModel, AutoTokenizer + +# Initialize LightRAG with Hugging Face model +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=hf_model_complete, # Use Hugging Face complete model for text generation + llm_model_name='meta-llama/Llama-3.1-8B-Instruct', # Model name from Hugging Face + embedding_func=hf_embedding, # Use Hugging Face embedding function + tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +) +``` +### Batch Insert ```python +# Batch Insert: Insert multiple texts at once rag.insert(["TEXT1", "TEXT2",...]) ``` -Incremental Insert +### Incremental Insert ```python +# Incremental Insert: Insert new documents into an existing LightRAG instance rag = LightRAG(working_dir="./dickens") with open("./newText.txt") as f: From a92f7bfd619816c01a68d6d89aaddbf8651075f6 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 20:06:59 +0800 Subject: [PATCH 30/67] update llm.py --- lightrag/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/llm.py b/lightrag/llm.py index bc2ac1f39..87b156c5b 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -118,9 +118,9 @@ async def gpt_4o_mini_complete( async def hf_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - input_string = kwargs['hashing_kv'].global_config['llm_model_name'] + model_name = kwargs['hashing_kv'].global_config['llm_model_name'] return await hf_model_if_cache( - input_string, + model_name, prompt, system_prompt=system_prompt, history_messages=history_messages, From 2190425d95bd4623b71896b03833a177a2558952 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 21:11:12 +0800 Subject: [PATCH 31/67] fix bug --- examples/lightrag_hf_demo.py | 11 ++++++++--- examples/lightrag_openai_demo.py | 6 +++--- 
lightrag/lightrag.py | 18 +++--------------- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/examples/lightrag_hf_demo.py b/examples/lightrag_hf_demo.py index f0e5fa99c..4cd503b3c 100644 --- a/examples/lightrag_hf_demo.py +++ b/examples/lightrag_hf_demo.py @@ -3,6 +3,7 @@ from lightrag import LightRAG, QueryParam from lightrag.llm import hf_model_complete, hf_embedding +from lightrag.utils import EmbeddingFunc from transformers import AutoModel,AutoTokenizer WORKING_DIR = "./dickens" @@ -14,9 +15,13 @@ working_dir=WORKING_DIR, llm_model_func=hf_model_complete, llm_model_name='meta-llama/Llama-3.1-8B-Instruct', - embedding_func=hf_embedding, - tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), - embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + embedding_func=EmbeddingFunc( + tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embedding_dim=384, + max_token_size=5000, + func=hf_embedding + ), ) diff --git a/examples/lightrag_openai_demo.py b/examples/lightrag_openai_demo.py index 677506c2f..507b2eb23 100644 --- a/examples/lightrag_openai_demo.py +++ b/examples/lightrag_openai_demo.py @@ -5,15 +5,15 @@ from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete from transformers import AutoModel,AutoTokenizer -WORKING_DIR = "./dickens" +WORKING_DIR = "/home/zrguo/code/myrag/agriculture" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) rag = LightRAG( working_dir=WORKING_DIR, - llm_model_func=gpt_4o_complete - # llm_model_func=gpt_4o_mini_complete + llm_model_func=gpt_4o_mini_complete + # llm_model_func=gpt_4o_complete ) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 329bfd12d..0d50a13d6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -76,12 +76,8 @@ class LightRAG: } ) - # text embedding - tokenizer: Any = None - embed_model: Any = None - # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) - embedding_func: EmbeddingFunc = field(default_factory=lambda:openai_embedding)# + embedding_func: EmbeddingFunc = field(default_factory=lambda:openai_embedding) embedding_batch_num: int = 32 embedding_func_max_async: int = 16 @@ -103,13 +99,6 @@ class LightRAG: convert_response_to_json_func: callable = convert_response_to_json def __post_init__(self): - if callable(self.embedding_func) and self.embedding_func.__name__ == 'hf_embedding': - if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - if self.embed_model is None: - self.embed_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - - log_file = os.path.join(self.working_dir, "lightrag.log") set_logger(log_file) logger.info(f"Logger initialized for working directory: {self.working_dir}") @@ -139,10 +128,9 @@ def __post_init__(self): self.chunk_entity_relation_graph = self.graph_storage_cls( namespace="chunk_entity_relation", global_config=asdict(self) ) + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( - lambda texts: self.embedding_func(texts, self.tokenizer, self.embed_model) - if callable(self.embedding_func) and self.embedding_func.__name__ == 'hf_embedding' - else self.embedding_func(texts) + self.embedding_func ) self.entities_vdb = ( From 756133512ee1898ef2e774a6bb0f2130dd2720e2 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 21:21:57 
+0800 Subject: [PATCH 32/67] fix bug --- examples/lightrag_hf_demo.py | 8 +++++--- examples/lightrag_openai_demo.py | 2 +- lightrag/__init__.py | 2 +- lightrag/llm.py | 5 ----- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/lightrag_hf_demo.py b/examples/lightrag_hf_demo.py index 4cd503b3c..baf62bdbf 100644 --- a/examples/lightrag_hf_demo.py +++ b/examples/lightrag_hf_demo.py @@ -16,11 +16,13 @@ llm_model_func=hf_model_complete, llm_model_name='meta-llama/Llama-3.1-8B-Instruct', embedding_func=EmbeddingFunc( - tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), - embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), embedding_dim=384, max_token_size=5000, - func=hf_embedding + func=lambda texts: hf_embedding( + texts, + tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + ) ), ) diff --git a/examples/lightrag_openai_demo.py b/examples/lightrag_openai_demo.py index 507b2eb23..fb1f055c3 100644 --- a/examples/lightrag_openai_demo.py +++ b/examples/lightrag_openai_demo.py @@ -5,7 +5,7 @@ from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete from transformers import AutoModel,AutoTokenizer -WORKING_DIR = "/home/zrguo/code/myrag/agriculture" +WORKING_DIR = "./dickens" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index b3d1d4ca8..dc8faa6a1 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.4" +__version__ = "0.0.5" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/llm.py b/lightrag/llm.py index 87b156c5b..bcb7e495c 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -141,11 +141,6 @@ async def openai_embedding(texts: list[str]) -> np.ndarray: return np.array([dp.embedding for dp in response.data]) - -@wrap_embedding_func_with_attrs( - embedding_dim=384, - max_token_size=5000, -) async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids with torch.no_grad(): From 7409668f9f2726c9f702a5ac9ed01e415e9b97b3 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 21:23:03 +0800 Subject: [PATCH 33/67] update README.md --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cc282c278..12a422f61 100644 --- a/README.md +++ b/README.md @@ -86,9 +86,16 @@ rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=hf_model_complete, # Use Hugging Face complete model for text generation llm_model_name='meta-llama/Llama-3.1-8B-Instruct', # Model name from Hugging Face - embedding_func=hf_embedding, # Use Hugging Face embedding function - tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), - embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + # Use Hugging Face embedding function + embedding_func=EmbeddingFunc( + embedding_dim=384, + max_token_size=5000, + func=lambda texts: hf_embedding( + texts, + tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), + embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + ) + ), ) ``` ### Batch Insert From 6389baef681477c89d653d776c7546d18c8a94b6 Mon Sep 17 
00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 22:30:16 +0800 Subject: [PATCH 34/67] Add Star History --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 12a422f61..f594f7898 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,17 @@ def extract_queries(file_path): return queries ``` + +## Star History + + + + + + Star History Chart + + + ## Code Structure ```python From c3a5c8a8ff3777c61a7bb87ae64c12a3859dad6c Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 22:31:41 +0800 Subject: [PATCH 35/67] Add Star History --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 12a422f61..af79eec29 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,7 @@ def extract_queries(file_path): return queries ``` + ## Code Structure ```python @@ -346,6 +347,17 @@ def extract_queries(file_path): ├── requirements.txt └── setup.py ``` + +## Star History + + + + + + Star History Chart + + + ## Citation ```python From c8a7c4fc79f4b89d2311b089e6183688783cc987 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 15 Oct 2024 22:34:02 +0800 Subject: [PATCH 36/67] Add Star history --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f594f7898..af79eec29 100644 --- a/README.md +++ b/README.md @@ -319,16 +319,6 @@ def extract_queries(file_path): return queries ``` -## Star History - - - - - - Star History Chart - - - ## Code Structure ```python @@ -357,6 +347,17 @@ def extract_queries(file_path): ├── requirements.txt └── setup.py ``` + +## Star History + + + + + + Star History Chart + + + ## Citation ```python From b651a2ebaa26d3cf6763076ad776d7e9499ec138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:48:34 +0800 Subject: [PATCH 37/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index af79eec29..8d688cc90 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# LightRAG: Simple and Fast Retrieval-Augmented Generation +# 🚀 LightRAG: Simple and Fast Retrieval-Augmented Generation ![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) From a1e3ca4a3352604e68f0235821faa531a42376d5 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:57:57 +0800 Subject: [PATCH 38/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d688cc90..39de81bc4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🚀 LightRAG: Simple and Fast Retrieval-Augmented Generation +

🚀 LightRAG: Simple and Fast Retrieval-Augmented Generation

![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) From b0ad8775f49accb8bb69ef4976856e889e88734e Mon Sep 17 00:00:00 2001 From: Sung Kim Date: Tue, 15 Oct 2024 12:55:05 -0700 Subject: [PATCH 39/67] Added OpenAI compatible options and examples --- examples/lightrag_openai_compatible_demo.py | 69 +++++++++++++++++++++ lightrag/llm.py | 16 +++-- 2 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 examples/lightrag_openai_compatible_demo.py diff --git a/examples/lightrag_openai_compatible_demo.py b/examples/lightrag_openai_compatible_demo.py new file mode 100644 index 000000000..75ecc1180 --- /dev/null +++ b/examples/lightrag_openai_compatible_demo.py @@ -0,0 +1,69 @@ +import os +import asyncio +from lightrag import LightRAG, QueryParam +from lightrag.llm import openai_complete_if_cache, openai_embedding +from lightrag.utils import EmbeddingFunc +import numpy as np + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs + ) + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar" + ) + +# function test +async def test_funcs(): + result = await llm_model_func("How are you?") + print("llm_model_func: ", result) + + result = await embedding_func(["How are you?"]) + print("embedding_func: ", result) + +asyncio.run(test_funcs()) + + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, + max_token_size=8192, + func=embedding_func + ) +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) diff --git a/lightrag/llm.py b/lightrag/llm.py index bcb7e495c..d2ca5344e 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -19,9 +19,12 @@ retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) async def openai_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], **kwargs + model, prompt, system_prompt=None, history_messages=[], base_url=None, api_key=None, **kwargs ) -> str: - openai_async_client = AsyncOpenAI() + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + openai_async_client = AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages = [] if system_prompt: @@ -133,10 +136,13 @@ async def hf_model_complete( wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) -async def openai_embedding(texts: list[str]) -> np.ndarray: - openai_async_client = AsyncOpenAI() +async 
def openai_embedding(texts: list[str], model: str = "text-embedding-3-small", base_url: str = None, api_key: str = None) -> np.ndarray: + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + openai_async_client = AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) response = await openai_async_client.embeddings.create( - model="text-embedding-3-small", input=texts, encoding_format="float" + model=model, input=texts, encoding_format="float" ) return np.array([dp.embedding for dp in response.data]) From 10d1ac48855adbcb9827e26cef6aa64972770901 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Wed, 16 Oct 2024 15:15:10 +0800 Subject: [PATCH 40/67] ollama test --- examples/lightrag_ollama_demo.py | 40 +++++++++++++++++++++++++ lightrag/__init__.py | 2 +- lightrag/lightrag.py | 2 +- lightrag/llm.py | 50 ++++++++++++++++++++++++++++++-- requirements.txt | 3 ++ setup.py | 2 +- 6 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 examples/lightrag_ollama_demo.py diff --git a/examples/lightrag_ollama_demo.py b/examples/lightrag_ollama_demo.py new file mode 100644 index 000000000..a2d04aa64 --- /dev/null +++ b/examples/lightrag_ollama_demo.py @@ -0,0 +1,40 @@ +import os + +from lightrag import LightRAG, QueryParam +from lightrag.llm import ollama_model_complete, ollama_embedding +from lightrag.utils import EmbeddingFunc + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=ollama_model_complete, + llm_model_name='your_model_name', + embedding_func=EmbeddingFunc( + embedding_dim=768, + max_token_size=8192, + func=lambda texts: ollama_embedding( + texts, + embed_model="nomic-embed-text" + ) + ), +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index dc8faa6a1..b6b953f1c 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG, QueryParam -__version__ = "0.0.5" +__version__ = "0.0.6" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0d50a13d6..83312ef6e 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -6,7 +6,7 @@ from typing import Type, cast, Any from transformers import AutoModel,AutoTokenizer, AutoModelForCausalLM -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding,hf_model_complete,hf_embedding +from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding, hf_model_complete, hf_embedding from .operate import ( chunking_by_token_size, extract_entities, diff --git a/lightrag/llm.py b/lightrag/llm.py index d2ca5344e..7328a5833 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -1,5 +1,6 @@ import os import numpy as np +import ollama from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout from tenacity import ( retry, @@ -92,6 +93,34 @@ async def hf_model_if_cache( ) return response_text +async def 
ollama_model_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + kwargs.pop("max_tokens", None) + kwargs.pop("response_format", None) + + ollama_client = ollama.AsyncClient() + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await ollama_client.chat(model=model, messages=messages, **kwargs) + + result = response["message"]["content"] + + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": result, "model": model}}) + + return result async def gpt_4o_complete( prompt, system_prompt=None, history_messages=[], **kwargs @@ -116,8 +145,6 @@ async def gpt_4o_mini_complete( **kwargs, ) - - async def hf_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -130,6 +157,18 @@ async def hf_model_complete( **kwargs, ) +async def ollama_model_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + model_name = kwargs['hashing_kv'].global_config['llm_model_name'] + return await ollama_model_if_cache( + model_name, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + @wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( stop=stop_after_attempt(3), @@ -154,6 +193,13 @@ async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: embeddings = outputs.last_hidden_state.mean(dim=1) return embeddings.detach().numpy() +async def ollama_embedding(texts: list[str], embed_model) -> np.ndarray: + embed_text = [] + for text in texts: + data = ollama.embeddings(model=embed_model, prompt=text) + embed_text.append(data["embedding"]) + + return embed_text if __name__ == "__main__": import asyncio diff --git a/requirements.txt b/requirements.txt index 8a74d5e2a..52edd1515 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ nano-vectordb hnswlib xxhash tenacity +transformers +torch +ollama \ No newline at end of file diff --git a/setup.py b/setup.py index 849fabfe9..472224206 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import setuptools -with open("README.md", "r") as fh: +with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() From 2b49f6ecf53d8dc84b277de259b6e92e21862feb Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Wed, 16 Oct 2024 15:33:59 +0800 Subject: [PATCH 41/67] update README.md --- README.md | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 39de81bc4..6dedff97e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@

-

@@ -21,6 +20,7 @@ This repository hosts the code of LightRAG. The structure of this code is based

## 🎉 News +- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports Ollama models! - [x] [2024.10.15]🎯🎯📢📢LightRAG now supports Hugging Face models! ## Install @@ -37,7 +37,7 @@ pip install lightrag-hku ``` ## Quick Start - +* All the code can be found in the `examples`. * Set OpenAI API key in environment if using OpenAI models: `export OPENAI_API_KEY="sk-...".` * Download the demo text "A Christmas Carol by Charles Dickens": ```bash @@ -84,7 +84,7 @@ from transformers import AutoModel, AutoTokenizer # Initialize LightRAG with Hugging Face model rag = LightRAG( working_dir=WORKING_DIR, - llm_model_func=hf_model_complete, # Use Hugging Face complete model for text generation + llm_model_func=hf_model_complete, # Use Hugging Face model for text generation llm_model_name='meta-llama/Llama-3.1-8B-Instruct', # Model name from Hugging Face # Use Hugging Face embedding function embedding_func=EmbeddingFunc( @@ -98,6 +98,27 @@ rag = LightRAG( ), ) ``` +### Using Ollama Models +If you want to use Ollama models, you only need to set LightRAG as follows: +```python +from lightrag.llm import ollama_model_complete, ollama_embedding + +# Initialize LightRAG with Ollama model +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=ollama_model_complete, # Use Ollama model for text generation + llm_model_name='your_model_name', # Your model name + # Use Ollama embedding function + embedding_func=EmbeddingFunc( + embedding_dim=768, + max_token_size=8192, + func=lambda texts: ollama_embedding( + texts, + embed_model="nomic-embed-text" + ) + ), +) +``` ### Batch Insert ```python # Batch Insert: Insert multiple texts at once @@ -326,8 +347,10 @@ def extract_queries(file_path): ├── examples │ ├── batch_eval.py │ ├── generate_query.py -│ ├── lightrag_openai_demo.py -│ └── lightrag_hf_demo.py +│ ├── lightrag_hf_demo.py +│ ├── lightrag_ollama_demo.py +│ ├── lightrag_openai_compatible_demo.py +│ └── lightrag_openai_demo.py ├── lightrag │ ├── __init__.py │ ├── base.py From 72200af1692c7b85edbacb295af6d19bc62d2192 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Wed, 16 Oct 2024 17:37:11 +0800 Subject: [PATCH 42/67] update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 52edd1515..f7dcd7870 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ xxhash tenacity transformers torch -ollama \ No newline at end of file +ollama +accelerate \ No newline at end of file From cf869fc6803c248bd201f4031027d031caabaf32 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Wed, 16 Oct 2024 17:45:49 +0800 Subject: [PATCH 43/67] update README.md --- README.md | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6dedff97e..358115c0b 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
## 🎉 News -- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports Ollama models! -- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports Hugging Face models! +- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-ollama-models)! +- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-hugging-face-models)! ## Install @@ -75,6 +75,42 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode= # Perform hybrid search print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) ``` + +### Open AI-like APIs +LightRAG also support Open AI-like chat/embeddings APIs: +```python +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs + ) + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar" + ) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, + max_token_size=8192, + func=embedding_func + ) +) +``` + ### Using Hugging Face Models If you want to use Hugging Face models, you only need to set LightRAG as follows: ```python @@ -98,6 +134,7 @@ rag = LightRAG( ), ) ``` + ### Using Ollama Models If you want to use Ollama models, you only need to set LightRAG as follows: ```python @@ -119,11 +156,13 @@ rag = LightRAG( ), ) ``` + ### Batch Insert ```python # Batch Insert: Insert multiple texts at once rag.insert(["TEXT1", "TEXT2",...]) ``` + ### Incremental Insert ```python @@ -207,6 +246,7 @@ Output your evaluation in the following JSON format: }} }} ``` + ### Overall Performance Table | | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | |----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| @@ -233,6 +273,7 @@ Output your evaluation in the following JSON format: ## Reproduce All the code can be found in the `./reproduce` directory. + ### Step-0 Extract Unique Contexts First, we need to extract unique contexts in the datasets. ```python @@ -286,6 +327,7 @@ def extract_unique_contexts(input_directory, output_directory): print("All files have been processed.") ``` + ### Step-1 Insert Contexts For the extracted contexts, we insert them into the LightRAG system. @@ -307,6 +349,7 @@ def insert_text(rag, file_path): if retries == max_retries: print("Insertion failed after exceeding the maximum number of retries") ``` + ### Step-2 Generate Queries We extract tokens from both the first half and the second half of each context in the dataset, then combine them as the dataset description to generate queries. 
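To make the Step-2 description concrete, the helper behind it can be sketched roughly as below. The GPT-2 tokenizer, the `get_summary` name, and the `(context, tot_tokens=2000)` signature come from the project's reproduce code quoted later in this series; the body shown here (the slicing and the `"\n\n"` join) is a simplified illustration of the "first half + second half" idea, not the exact implementation.

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def get_summary(context, tot_tokens=2000):
    # Keep roughly tot_tokens/2 tokens from the start and tot_tokens/2 from the end
    # of the context, then stitch the two halves back into plain text to serve as
    # the dataset description for query generation.
    tokens = tokenizer.tokenize(context)
    half = tot_tokens // 2
    start_text = tokenizer.convert_tokens_to_string(tokens[:half])
    end_text = tokenizer.convert_tokens_to_string(tokens[-half:])
    return start_text + "\n\n" + end_text
```

The resulting description is then slotted into the query-generation prompt from `examples/generate_query.py` to produce the high-level questions used in Step-3.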
From 1e74af59a7ab34d2c3ba9b662c370bfdc58780b1 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Wed, 16 Oct 2024 18:24:47 +0800 Subject: [PATCH 44/67] Update README.md --- README.md | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 358115c0b..fb29945bb 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-ollama-models)! -- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-hugging-face-models)! +- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! +- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! ## Install @@ -76,7 +76,9 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode= print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) ``` -### Open AI-like APIs +
+ Using Open AI-like APIs + LightRAG also support Open AI-like chat/embeddings APIs: ```python async def llm_model_func( @@ -110,8 +112,11 @@ rag = LightRAG( ) ) ``` +
-### Using Hugging Face Models +
+ Using Hugging Face Models + If you want to use Hugging Face models, you only need to set LightRAG as follows: ```python from lightrag.llm import hf_model_complete, hf_embedding @@ -134,9 +139,12 @@ rag = LightRAG( ), ) ``` +
-### Using Ollama Models +
+ Using Ollama Models If you want to use Ollama models, you only need to set LightRAG as follows: + ```python from lightrag.llm import ollama_model_complete, ollama_embedding @@ -156,6 +164,7 @@ rag = LightRAG( ), ) ``` +
### Batch Insert ```python @@ -178,6 +187,10 @@ The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https ### Generate Query LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`. + +
+ Prompt + ```python Given the following description of a dataset: @@ -201,9 +214,14 @@ Output the results in the following structure: - User 5: [user description] ... ``` +
### Batch Eval To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`. + +
+ Prompt + ```python ---Role--- You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. @@ -246,6 +264,7 @@ Output your evaluation in the following JSON format: }} }} ``` +
### Overall Performance Table | | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | @@ -276,6 +295,10 @@ All the code can be found in the `./reproduce` directory. ### Step-0 Extract Unique Contexts First, we need to extract unique contexts in the datasets. + +
+ Code + ```python def extract_unique_contexts(input_directory, output_directory): @@ -327,10 +350,14 @@ def extract_unique_contexts(input_directory, output_directory): print("All files have been processed.") ``` +
### Step-1 Insert Contexts For the extracted contexts, we insert them into the LightRAG system. +
+ Code + ```python def insert_text(rag, file_path): with open(file_path, mode='r') as f: @@ -349,10 +376,15 @@ def insert_text(rag, file_path): if retries == max_retries: print("Insertion failed after exceeding the maximum number of retries") ``` +
### Step-2 Generate Queries We extract tokens from both the first half and the second half of each context in the dataset, then combine them as the dataset description to generate queries. + +
+ Code + ```python tokenizer = GPT2Tokenizer.from_pretrained('gpt2') @@ -368,9 +400,14 @@ def get_summary(context, tot_tokens=2000): return summary ``` +
### Step-3 Query For the queries generated in Step-2, we will extract them and query LightRAG. + +
+ Code + ```python def extract_queries(file_path): with open(file_path, 'r') as f: @@ -382,6 +419,7 @@ def extract_queries(file_path): return queries ``` +
## Code Structure From 7ab699955e05d35ea89d4b46fb72138e15dcc877 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:29:08 +0800 Subject: [PATCH 45/67] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb29945bb..7ad8dd26d 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## Install -* Install from source +* Install from source (Recommend) ```bash cd LightRAG @@ -142,7 +142,7 @@ rag = LightRAG(
- Using Ollama Models + Using Ollama Models (There are some bugs. I'll fix them ASAP.) If you want to use Ollama models, you only need to set LightRAG as follows: ```python From 0e0a037a1d15743798286146c998e6cfa29ddc1e Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:39:11 +0800 Subject: [PATCH 46/67] Add Discord channel link --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7ad8dd26d..ff6fe44a1 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@

- +

+ @@ -20,6 +21,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2024.10.17]🎯🎯📢📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉 - [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! - [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! From a2f1654f4cc2eeb73b38ca6e1d2ff787bc514a34 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 17 Oct 2024 16:02:43 +0800 Subject: [PATCH 47/67] fix Ollama bugs --- README.md | 2 +- lightrag/operate.py | 81 ++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index ff6fe44a1..fd85141bf 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ rag = LightRAG(

- Using Ollama Models (There are some bugs. I'll fix them ASAP.) + Using Ollama Models If you want to use Ollama models, you only need to set LightRAG as follows: ```python diff --git a/lightrag/operate.py b/lightrag/operate.py index 3d388cb61..3a17810a0 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -387,6 +387,7 @@ async def local_query( query_param: QueryParam, global_config: dict, ) -> str: + context = None use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] @@ -399,7 +400,9 @@ async def local_query( keywords = ', '.join(keywords) except json.JSONDecodeError as e: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() + result = '{' + result.split('{')[1].split('}')[0] + '}' + keywords_data = json.loads(result) keywords = keywords_data.get("low_level_keywords", []) keywords = ', '.join(keywords) @@ -407,13 +410,14 @@ async def local_query( except json.JSONDecodeError as e: print(f"JSON parsing error: {e}") return PROMPTS["fail_response"] - context = await _build_local_query_context( - keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) + if keywords: + context = await _build_local_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) if query_param.only_need_context: return context if context is None: @@ -614,6 +618,7 @@ async def global_query( query_param: QueryParam, global_config: dict, ) -> str: + context = None use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] @@ -626,7 +631,9 @@ async def global_query( keywords = ', '.join(keywords) except json.JSONDecodeError as e: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() + result = '{' + result.split('{')[1].split('}')[0] + '}' + keywords_data = json.loads(result) keywords = keywords_data.get("high_level_keywords", []) keywords = ', '.join(keywords) @@ -635,15 +642,15 @@ async def global_query( # Handle parsing error print(f"JSON parsing error: {e}") return PROMPTS["fail_response"] - - context = await _build_global_query_context( - keywords, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) + if keywords: + context = await _build_global_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) if query_param.only_need_context: return context @@ -836,6 +843,8 @@ async def hybrid_query( query_param: QueryParam, global_config: dict, ) -> str: + low_level_context = None + high_level_context = None use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] @@ -850,7 +859,9 @@ async def hybrid_query( ll_keywords = ', '.join(ll_keywords) except json.JSONDecodeError as e: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip().strip('```').strip('json') + result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() + result = '{' + result.split('{')[1].split('}')[0] + '}' + keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) ll_keywords = 
keywords_data.get("low_level_keywords", []) @@ -861,22 +872,24 @@ async def hybrid_query( print(f"JSON parsing error: {e}") return PROMPTS["fail_response"] - low_level_context = await _build_local_query_context( - ll_keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) + if ll_keywords: + low_level_context = await _build_local_query_context( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) - high_level_context = await _build_global_query_context( - hl_keywords, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) + if hl_keywords: + high_level_context = await _build_global_query_context( + hl_keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) context = combine_contexts(high_level_context, low_level_context) From 70dbca190e296fd1aeeb45d384af06eeaced3285 Mon Sep 17 00:00:00 2001 From: KIM Jae Boum Date: Fri, 18 Oct 2024 06:06:47 +0800 Subject: [PATCH 48/67] update Step_3.py and openai compatible script --- reproduce/Step_1_openai_compatible.py | 66 ++++++++++++++++++ reproduce/Step_3.py | 4 +- reproduce/Step_3_openai_compatible.py | 99 +++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 reproduce/Step_1_openai_compatible.py create mode 100644 reproduce/Step_3_openai_compatible.py diff --git a/reproduce/Step_1_openai_compatible.py b/reproduce/Step_1_openai_compatible.py new file mode 100644 index 000000000..b5c6aef3c --- /dev/null +++ b/reproduce/Step_1_openai_compatible.py @@ -0,0 +1,66 @@ +import os +import json +import time +import numpy as np + +from lightrag import LightRAG +from lightrag.utils import EmbeddingFunc +from lightrag.llm import openai_complete_if_cache, openai_embedding + +## For Upstage API +# please check if embedding_dim=4096 in lightrag.py and llm.py in lightrag direcotry +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs + ) + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar" + ) +## /For Upstage API + +def insert_text(rag, file_path): + with open(file_path, mode='r') as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") + +cls = "mix" +WORKING_DIR = f"../{cls}" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG(working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, + max_token_size=8192, + func=embedding_func + ) + ) + +insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py index e97e2af62..a79ebd17a 100644 --- a/reproduce/Step_3.py +++ b/reproduce/Step_3.py @@ -53,10 +53,10 @@ def 
run_queries_and_save_to_json(queries, rag_instance, query_param, output_file if __name__ == "__main__": cls = "agriculture" mode = "hybrid" - WORKING_DIR = "../{cls}" + WORKING_DIR = f"../{cls}" rag = LightRAG(working_dir=WORKING_DIR) query_param = QueryParam(mode=mode) queries = extract_queries(f"../datasets/questions/{cls}_questions.txt") - run_queries_and_save_to_json(queries, rag, query_param, "result.json", "errors.json") \ No newline at end of file + run_queries_and_save_to_json(queries, rag, query_param, f"{cls}_result.json", f"{cls}_errors.json") diff --git a/reproduce/Step_3_openai_compatible.py b/reproduce/Step_3_openai_compatible.py new file mode 100644 index 000000000..7b3079a9b --- /dev/null +++ b/reproduce/Step_3_openai_compatible.py @@ -0,0 +1,99 @@ +import os +import re +import json +import asyncio +from lightrag import LightRAG, QueryParam +from tqdm import tqdm +from lightrag.llm import openai_complete_if_cache, openai_embedding +from lightrag.utils import EmbeddingFunc +import numpy as np + +## For Upstage API +# please check if embedding_dim=4096 in lightrag.py and llm.py in lightrag direcotry +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs + ) + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar" + ) +## /For Upstage API + +def extract_queries(file_path): + with open(file_path, 'r') as f: + data = f.read() + + data = data.replace('**', '') + + queries = re.findall(r'- Question \d+: (.+)', data) + + return queries + +async def process_query(query_text, rag_instance, query_param): + try: + result, context = await rag_instance.aquery(query_text, param=query_param) + return {"query": query_text, "result": result, "context": context}, None + except Exception as e: + return None, {"query": query_text, "error": str(e)} + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + +def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file, error_file): + loop = always_get_an_event_loop() + + with open(output_file, 'a', encoding='utf-8') as result_file, open(error_file, 'a', encoding='utf-8') as err_file: + result_file.write("[\n") + first_entry = True + + for query_text in tqdm(queries, desc="Processing queries", unit="query"): + result, error = loop.run_until_complete(process_query(query_text, rag_instance, query_param)) + + if result: + if not first_entry: + result_file.write(",\n") + json.dump(result, result_file, ensure_ascii=False, indent=4) + first_entry = False + elif error: + json.dump(error, err_file, ensure_ascii=False, indent=4) + err_file.write("\n") + + result_file.write("\n]") + +if __name__ == "__main__": + cls = "mix" + mode = "hybrid" + WORKING_DIR = f"../{cls}" + + rag = LightRAG(working_dir=WORKING_DIR) + rag = LightRAG(working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, + max_token_size=8192, + func=embedding_func + ) + ) + query_param = QueryParam(mode=mode) + + 
base_dir='../datasets/questions' + queries = extract_queries(f"{base_dir}/{cls}_questions.txt") + run_queries_and_save_to_json(queries, rag, query_param, f"{base_dir}/result.json", f"{base_dir}/errors.json") From 996c9543a55d773ef37930d7569c63c742b925fd Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:14:14 +0800 Subject: [PATCH 49/67] Add a link to a LightRAG explanatory video --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index fd85141bf..2987507d8 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@

+

@@ -21,6 +22,7 @@ This repository hosts the code of LightRAG. The structure of this code is based
## 🎉 News +- [x] [2024.10.18]🎯🎯📢📢We’ve added a link to a [LightRAG explanatory video](https://youtu.be/oageL-1I0GE). Thanks to the author! - [x] [2024.10.17]🎯🎯📢📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉 - [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! - [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! From d04f70d4254eb024e8bd2347594d29149413363f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:45:30 +0800 Subject: [PATCH 50/67] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2987507d8..d0ed8a357 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2024.10.18]🎯🎯📢📢We’ve added a link to a [LightRAG explanatory video](https://youtu.be/oageL-1I0GE). Thanks to the author! +- [x] [2024.10.18]🎯🎯📢📢We’ve added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! - [x] [2024.10.17]🎯🎯📢📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉 - [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! - [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! From f576a28e0d66904a382f3eae076f1ff2699a6239 Mon Sep 17 00:00:00 2001 From: zrguo Date: Fri, 18 Oct 2024 15:32:58 +0800 Subject: [PATCH 51/67] Create lightrag_azure_openai_demo.py --- examples/lightrag_azure_openai_demo.py | 125 +++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 examples/lightrag_azure_openai_demo.py diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py new file mode 100644 index 000000000..62282a258 --- /dev/null +++ b/examples/lightrag_azure_openai_demo.py @@ -0,0 +1,125 @@ +import os +import asyncio +from lightrag import LightRAG, QueryParam +from lightrag.utils import EmbeddingFunc +import numpy as np +from dotenv import load_dotenv +import aiohttp +import logging + +logging.basicConfig(level=logging.INFO) + +load_dotenv() + +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") +AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT") +AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") +AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") + +AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT") +AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION") + +WORKING_DIR = "./dickens" + +if os.path.exists(WORKING_DIR): + import shutil + + shutil.rmtree(WORKING_DIR) + +os.mkdir(WORKING_DIR) + + +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + headers = { + "Content-Type": "application/json", + "api-key": AZURE_OPENAI_API_KEY, + } + endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/chat/completions?api-version={AZURE_OPENAI_API_VERSION}" + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + if history_messages: + 
messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + payload = { + "messages": messages, + "temperature": kwargs.get("temperature", 0), + "top_p": kwargs.get("top_p", 1), + "n": kwargs.get("n", 1), + } + + async with aiohttp.ClientSession() as session: + async with session.post(endpoint, headers=headers, json=payload) as response: + if response.status != 200: + raise ValueError( + f"Request failed with status {response.status}: {await response.text()}" + ) + result = await response.json() + return result["choices"][0]["message"]["content"] + + +async def embedding_func(texts: list[str]) -> np.ndarray: + headers = { + "Content-Type": "application/json", + "api-key": AZURE_OPENAI_API_KEY, + } + endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_EMBEDDING_DEPLOYMENT}/embeddings?api-version={AZURE_EMBEDDING_API_VERSION}" + + payload = {"input": texts} + + async with aiohttp.ClientSession() as session: + async with session.post(endpoint, headers=headers, json=payload) as response: + if response.status != 200: + raise ValueError( + f"Request failed with status {response.status}: {await response.text()}" + ) + result = await response.json() + embeddings = [item["embedding"] for item in result["data"]] + return np.array(embeddings) + + +async def test_funcs(): + result = await llm_model_func("How are you?") + print("Resposta do llm_model_func: ", result) + + result = await embedding_func(["How are you?"]) + print("Resultado do embedding_func: ", result.shape) + print("Dimensão da embedding: ", result.shape[1]) + + +asyncio.run(test_funcs()) + +embedding_dimension = 3072 + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), +) + +book1 = open("./book_1.txt", encoding="utf-8") +book2 = open("./book_2.txt", encoding="utf-8") + +rag.insert([book1.read(), book2.read()]) + +query_text = "What are the main themes?" 
+ +print("Result (Naive):") +print(rag.query(query_text, param=QueryParam(mode="naive"))) + +print("\nResult (Local):") +print(rag.query(query_text, param=QueryParam(mode="local"))) + +print("\nResult (Global):") +print(rag.query(query_text, param=QueryParam(mode="global"))) + +print("\nResult (Hybrid):") +print(rag.query(query_text, param=QueryParam(mode="hybrid"))) \ No newline at end of file From e7a7ff62b264ae7dde437c8dac3e32847090805a Mon Sep 17 00:00:00 2001 From: zrguo Date: Fri, 18 Oct 2024 15:33:11 +0800 Subject: [PATCH 52/67] Update operate.py --- lightrag/operate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 3a17810a0..930ceb2a0 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -76,7 +76,7 @@ async def _handle_single_entity_extraction( record_attributes: list[str], chunk_key: str, ): - if record_attributes[0] != '"entity"' or len(record_attributes) < 4: + if len(record_attributes) < 4 or record_attributes[0] != '"entity"': return None # add this record as a node in the G entity_name = clean_str(record_attributes[1].upper()) @@ -97,7 +97,7 @@ async def _handle_single_relationship_extraction( record_attributes: list[str], chunk_key: str, ): - if record_attributes[0] != '"relationship"' or len(record_attributes) < 5: + if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': return None # add this record as edge source = clean_str(record_attributes[1].upper()) From 705087529524ec96602435cd5eb736f0632e1d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Galego?= Date: Fri, 18 Oct 2024 14:17:14 +0100 Subject: [PATCH 53/67] Added support for Amazon Bedrock models --- .gitignore | 4 + examples/lightrag_bedrock_demo.py | 48 +++++++++++ lightrag/llm.py | 128 ++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 181 insertions(+) create mode 100644 .gitignore create mode 100644 examples/lightrag_bedrock_demo.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..cb457220e --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +*.egg-info +dickens/ +book.txt \ No newline at end of file diff --git a/examples/lightrag_bedrock_demo.py b/examples/lightrag_bedrock_demo.py new file mode 100644 index 000000000..36ec38578 --- /dev/null +++ b/examples/lightrag_bedrock_demo.py @@ -0,0 +1,48 @@ +""" +LightRAG meets Amazon Bedrock ⛰️ +""" + +import os + +from lightrag import LightRAG, QueryParam +from lightrag.llm import bedrock_complete, bedrock_embedding +from lightrag.utils import EmbeddingFunc + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=bedrock_complete, + llm_model_name="anthropic.claude-3-haiku-20240307-v1:0", + node2vec_params = { + 'dimensions': 1024, + 'num_walks': 10, + 'walk_length': 40, + 'window_size': 2, + 'iterations': 3, + 'random_seed': 3 + }, + embedding_func=EmbeddingFunc( + embedding_dim=1024, + max_token_size=8192, + func=lambda texts: bedrock_embedding(texts) + ) +) + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Hybrid search +print(rag.query("What are the top themes in this 
story?", param=QueryParam(mode="hybrid"))) diff --git a/lightrag/llm.py b/lightrag/llm.py index 7328a5833..8fc0da2ee 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -1,4 +1,6 @@ import os +import json +import aioboto3 import numpy as np import ollama from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout @@ -48,6 +50,54 @@ async def openai_complete_if_cache( ) return response.choices[0].message.content +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), +) +async def bedrock_complete_if_cache( + model, prompt, system_prompt=None, history_messages=[], base_url=None, + aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None, **kwargs +) -> str: + os.environ['AWS_ACCESS_KEY_ID'] = os.environ.get('AWS_ACCESS_KEY_ID', aws_access_key_id) + os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret_access_key) + os.environ['AWS_SESSION_TOKEN'] = os.environ.get('AWS_SESSION_TOKEN', aws_session_token) + + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + + messages = [] + messages.extend(history_messages) + messages.append({'role': "user", 'content': [{'text': prompt}]}) + + args = { + 'modelId': model, + 'messages': messages + } + + if system_prompt: + args['system'] = [{'text': system_prompt}] + + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + session = aioboto3.Session() + async with session.client("bedrock-runtime") as bedrock_async_client: + + response = await bedrock_async_client.converse(**args, **kwargs) + + if hashing_kv is not None: + await hashing_kv.upsert({ + args_hash: { + 'return': response['output']['message']['content'][0]['text'], + 'model': model + } + }) + + return response['output']['message']['content'][0]['text'] + async def hf_model_if_cache( model, prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -145,6 +195,19 @@ async def gpt_4o_mini_complete( **kwargs, ) + +async def bedrock_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await bedrock_complete_if_cache( + "anthropic.claude-3-sonnet-20240229-v1:0", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + async def hf_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -186,6 +249,71 @@ async def openai_embedding(texts: list[str], model: str = "text-embedding-3-smal return np.array([dp.embedding for dp in response.data]) +# @wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) +# @retry( +# stop=stop_after_attempt(3), +# wait=wait_exponential(multiplier=1, min=4, max=10), +# retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), # TODO: fix exceptions +# ) +async def bedrock_embedding( + texts: list[str], model: str = "amazon.titan-embed-text-v2:0", + aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None) -> np.ndarray: + os.environ['AWS_ACCESS_KEY_ID'] = os.environ.get('AWS_ACCESS_KEY_ID', aws_access_key_id) + os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret_access_key) + os.environ['AWS_SESSION_TOKEN'] = os.environ.get('AWS_SESSION_TOKEN', aws_session_token) + + session = aioboto3.Session() + async with 
session.client("bedrock-runtime") as bedrock_async_client: + + if (model_provider := model.split(".")[0]) == "amazon": + embed_texts = [] + for text in texts: + if "v2" in model: + body = json.dumps({ + 'inputText': text, + # 'dimensions': embedding_dim, + 'embeddingTypes': ["float"] + }) + elif "v1" in model: + body = json.dumps({ + 'inputText': text + }) + else: + raise ValueError(f"Model {model} is not supported!") + + response = await bedrock_async_client.invoke_model( + modelId=model, + body=body, + accept="application/json", + contentType="application/json" + ) + + response_body = await response.get('body').json() + + embed_texts.append(response_body['embedding']) + elif model_provider == "cohere": + body = json.dumps({ + 'texts': texts, + 'input_type': "search_document", + 'truncate': "NONE" + }) + + response = await bedrock_async_client.invoke_model( + model=model, + body=body, + accept="application/json", + contentType="application/json" + ) + + response_body = json.loads(response.get('body').read()) + + embed_texts = response_body['embeddings'] + else: + raise ValueError(f"Model provider '{model_provider}' is not supported!") + + return np.array(embed_texts) + + async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids with torch.no_grad(): diff --git a/requirements.txt b/requirements.txt index f7dcd7870..a1054692a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +aioboto3 openai tiktoken networkx From 75a91d9300aa62cf0e918003e430e391c8d69ccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Galego?= Date: Fri, 18 Oct 2024 16:50:02 +0100 Subject: [PATCH 54/67] Fixed retry strategy, message history and inference params; Cleaned up Bedrock example --- examples/lightrag_bedrock_demo.py | 39 +++++++++++-------------- lightrag/llm.py | 48 +++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/examples/lightrag_bedrock_demo.py b/examples/lightrag_bedrock_demo.py index 36ec38578..c515922e0 100644 --- a/examples/lightrag_bedrock_demo.py +++ b/examples/lightrag_bedrock_demo.py @@ -3,46 +3,39 @@ """ import os +import logging from lightrag import LightRAG, QueryParam from lightrag.llm import bedrock_complete, bedrock_embedding from lightrag.utils import EmbeddingFunc -WORKING_DIR = "./dickens" +logging.getLogger("aiobotocore").setLevel(logging.WARNING) +WORKING_DIR = "./dickens" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=bedrock_complete, - llm_model_name="anthropic.claude-3-haiku-20240307-v1:0", - node2vec_params = { - 'dimensions': 1024, - 'num_walks': 10, - 'walk_length': 40, - 'window_size': 2, - 'iterations': 3, - 'random_seed': 3 - }, + llm_model_name="Anthropic Claude 3 Haiku // Amazon Bedrock", embedding_func=EmbeddingFunc( embedding_dim=1024, max_token_size=8192, - func=lambda texts: bedrock_embedding(texts) + func=bedrock_embedding ) ) -with open("./book.txt") as f: +with open("./book.txt", 'r', encoding='utf-8') as f: rag.insert(f.read()) -# Naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) - -# Local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) - -# Global search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) - -# Hybrid search -print(rag.query("What are the top themes in this 
story?", param=QueryParam(mode="hybrid"))) +for mode in ["naive", "local", "global", "hybrid"]: + print("\n+-" + "-" * len(mode) + "-+") + print(f"| {mode.capitalize()} |") + print("+-" + "-" * len(mode) + "-+\n") + print( + rag.query( + "What are the top themes in this story?", + param=QueryParam(mode=mode) + ) + ) diff --git a/lightrag/llm.py b/lightrag/llm.py index 8fc0da2ee..48defb4dd 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -1,6 +1,9 @@ import os +import copy import json +import botocore import aioboto3 +import botocore.errorfactory import numpy as np import ollama from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout @@ -50,43 +53,70 @@ async def openai_complete_if_cache( ) return response.choices[0].message.content + +class BedrockError(Exception): + """Generic error for issues related to Amazon Bedrock""" + + @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, max=60), + retry=retry_if_exception_type((BedrockError)), ) async def bedrock_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], base_url=None, + model, prompt, system_prompt=None, history_messages=[], aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None, **kwargs ) -> str: os.environ['AWS_ACCESS_KEY_ID'] = os.environ.get('AWS_ACCESS_KEY_ID', aws_access_key_id) os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret_access_key) os.environ['AWS_SESSION_TOKEN'] = os.environ.get('AWS_SESSION_TOKEN', aws_session_token) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - + # Fix message history format messages = [] - messages.extend(history_messages) + for history_message in history_messages: + message = copy.copy(history_message) + message['content'] = [{'text': message['content']}] + messages.append(message) + + # Add user prompt messages.append({'role': "user", 'content': [{'text': prompt}]}) + # Initialize Converse API arguments args = { 'modelId': model, 'messages': messages } + # Define system prompt if system_prompt: args['system'] = [{'text': system_prompt}] + # Map and set up inference parameters + inference_params_map = { + 'max_tokens': "maxTokens", + 'top_p': "topP", + 'stop_sequences': "stopSequences" + } + if (inference_params := list(set(kwargs) & set(['max_tokens', 'temperature', 'top_p', 'stop_sequences']))): + args['inferenceConfig'] = {} + for param in inference_params: + args['inferenceConfig'][inference_params_map.get(param, param)] = kwargs.pop(param) + + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) if hashing_kv is not None: args_hash = compute_args_hash(model, messages) if_cache_return = await hashing_kv.get_by_id(args_hash) if if_cache_return is not None: return if_cache_return["return"] + # Call model via Converse API session = aioboto3.Session() async with session.client("bedrock-runtime") as bedrock_async_client: - response = await bedrock_async_client.converse(**args, **kwargs) + try: + response = await bedrock_async_client.converse(**args, **kwargs) + except Exception as e: + raise BedrockError(e) if hashing_kv is not None: await hashing_kv.upsert({ @@ -200,7 +230,7 @@ async def bedrock_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: return await bedrock_complete_if_cache( - "anthropic.claude-3-sonnet-20240229-v1:0", + 
"anthropic.claude-3-haiku-20240307-v1:0", prompt, system_prompt=system_prompt, history_messages=history_messages, From a7b43d27dbe2e77c7cf666ba0327e08ec60815b9 Mon Sep 17 00:00:00 2001 From: Wade Rosko <7385473+wrosko@users.noreply.github.com> Date: Fri, 18 Oct 2024 18:09:48 -0600 Subject: [PATCH 55/67] Add comment specifying jupyter req Add lines that can be uncommented if running in a jupyter notebook --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d0ed8a357..bd226582e 100644 --- a/README.md +++ b/README.md @@ -47,12 +47,21 @@ pip install lightrag-hku ```bash curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt ``` -Use the below Python snippet to initialize LightRAG and perform queries: +Use the below Python snippet (in a script) to initialize LightRAG and perform queries: ```python from lightrag import LightRAG, QueryParam from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete +######### +# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert() +# import nest_asyncio +# nest_asyncio.apply() +######### + +WORKING_DIR = "./dickens" + + WORKING_DIR = "./dickens" if not os.path.exists(WORKING_DIR): From e2db7b6c45ac4b48d7026d69b3a770b42bad4dbe Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:46:03 +0800 Subject: [PATCH 56/67] fix prompt.py --- lightrag/prompt.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 5d28e49c5..67d52d638 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -163,25 +163,10 @@ {response_type} - ---Data tables--- {context_data} - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. - -If you don't know the answer, just say so. Do not make anything up. - -Do not include information where the supporting evidence for it is not provided. - - ----Target response length and format--- - -{response_type} - Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. 
""" From 744dad339d6b06505659ab5b1091180aecdc4c3b Mon Sep 17 00:00:00 2001 From: Sanketh Kumar Date: Sat, 19 Oct 2024 09:43:17 +0530 Subject: [PATCH 57/67] chore: added pre-commit-hooks and ruff formatting for commit-hooks --- .gitignore | 3 +- .pre-commit-config.yaml | 22 ++ README.md | 50 ++--- examples/batch_eval.py | 38 ++-- examples/generate_query.py | 9 +- examples/lightrag_azure_openai_demo.py | 2 +- examples/lightrag_bedrock_demo.py | 13 +- examples/lightrag_hf_demo.py | 35 ++- examples/lightrag_ollama_demo.py | 25 ++- examples/lightrag_openai_compatible_demo.py | 32 ++- examples/lightrag_openai_demo.py | 22 +- lightrag/__init__.py | 2 +- lightrag/base.py | 11 +- lightrag/lightrag.py | 65 +++--- lightrag/llm.py | 223 ++++++++++++------- lightrag/operate.py | 229 +++++++++++++------- lightrag/prompt.py | 14 +- lightrag/storage.py | 15 +- lightrag/utils.py | 28 ++- reproduce/Step_0.py | 24 +- reproduce/Step_1.py | 8 +- reproduce/Step_1_openai_compatible.py | 29 ++- reproduce/Step_2.py | 20 +- reproduce/Step_3.py | 29 ++- reproduce/Step_3_openai_compatible.py | 54 +++-- requirements.txt | 16 +- 26 files changed, 630 insertions(+), 388 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.gitignore b/.gitignore index cb457220e..50f384ec3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__ *.egg-info dickens/ -book.txt \ No newline at end of file +book.txt +lightrag-dev/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..db531bb62 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: requirements-txt-fixer + + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff-format + - id: ruff + args: [--fix] + + + - repo: https://github.com/mgedmin/check-manifest + rev: "0.49" + hooks: + - id: check-manifest + stages: [manual] diff --git a/README.md b/README.md index d0ed8a357..b3a049577 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,16 @@

- + This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) -## 🎉 News +## 🎉 News - [x] [2024.10.18]🎯🎯📢📢We’ve added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! - [x] [2024.10.17]🎯🎯📢📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉 -- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! -- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! +- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! +- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! ## Install @@ -83,7 +83,7 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode=
Using Open AI-like APIs -LightRAG also support Open AI-like chat/embeddings APIs: +LightRAG also supports Open AI-like chat/embeddings APIs: ```python async def llm_model_func( prompt, system_prompt=None, history_messages=[], **kwargs @@ -120,7 +120,7 @@ rag = LightRAG(
Using Hugging Face Models - + If you want to use Hugging Face models, you only need to set LightRAG as follows: ```python from lightrag.llm import hf_model_complete, hf_embedding @@ -136,7 +136,7 @@ rag = LightRAG( embedding_dim=384, max_token_size=5000, func=lambda texts: hf_embedding( - texts, + texts, tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") ) @@ -148,7 +148,7 @@ rag = LightRAG(
Using Ollama Models If you want to use Ollama models, you only need to set LightRAG as follows: - + ```python from lightrag.llm import ollama_model_complete, ollama_embedding @@ -162,7 +162,7 @@ rag = LightRAG( embedding_dim=768, max_token_size=8192, func=lambda texts: ollama_embedding( - texts, + texts, embed_model="nomic-embed-text" ) ), @@ -187,14 +187,14 @@ with open("./newText.txt") as f: ``` ## Evaluation ### Dataset -The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). +The dataset used in LightRAG can be downloaded from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). ### Generate Query -LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`. +LightRAG uses the following prompt to generate high-level queries, with the corresponding code in `example/generate_query.py`.
Prompt - + ```python Given the following description of a dataset: @@ -219,18 +219,18 @@ Output the results in the following structure: ... ```
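For reference, a minimal sketch of driving this prompt with an OpenAI-compatible chat model is shown below (the repository's own version lives in `example/generate_query.py`); the `prompt_template` wiring, the `{description}` placeholder, and the `gpt-4o-mini` default are illustrative assumptions rather than the exact code.

```python
# Minimal sketch, not the repository's exact generate_query.py:
# fill the prompt above with a dataset description and request queries
# from an OpenAI-compatible chat model.
from openai import OpenAI  # assumes OPENAI_API_KEY is set in the environment


def generate_queries(description: str, prompt_template: str, model: str = "gpt-4o-mini") -> str:
    client = OpenAI()
    # prompt_template is assumed to hold the prompt text shown above,
    # with a {description} placeholder for the dataset summary.
    prompt = prompt_template.format(description=description)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```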
- + ### Batch Eval To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`.
Prompt - + ```python ---Role--- You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. ---Goal--- -You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. +You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? @@ -294,7 +294,7 @@ Output your evaluation in the following JSON format: | **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | | **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | -## Reproduce +## Reproduce All the code can be found in the `./reproduce` directory. ### Step-0 Extract Unique Contexts @@ -302,7 +302,7 @@ First, we need to extract unique contexts in the datasets.
Code - + ```python def extract_unique_contexts(input_directory, output_directory): @@ -361,12 +361,12 @@ For the extracted contexts, we insert them into the LightRAG system.
Code - + ```python def insert_text(rag, file_path): with open(file_path, mode='r') as f: unique_contexts = json.load(f) - + retries = 0 max_retries = 3 while retries < max_retries: @@ -384,11 +384,11 @@ def insert_text(rag, file_path): ### Step-2 Generate Queries -We extract tokens from both the first half and the second half of each context in the dataset, then combine them as the dataset description to generate queries. +We extract tokens from the first and the second half of each context in the dataset, then combine them as dataset descriptions to generate queries.
Code - + ```python tokenizer = GPT2Tokenizer.from_pretrained('gpt2') @@ -401,7 +401,7 @@ def get_summary(context, tot_tokens=2000): summary_tokens = start_tokens + end_tokens summary = tokenizer.convert_tokens_to_string(summary_tokens) - + return summary ```
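Continuing from the snippet above, one plausible way to turn the per-context summaries into the dataset description consumed by the query-generation prompt is sketched below; the file path follows the `unique_contexts` layout used in `reproduce/Step_1_openai_compatible.py`, while the joining strategy is an assumption rather than the exact Step-2 code.

```python
# Sketch only: build a dataset description from per-context summaries.
# Assumes the tokenizer and get_summary() from the snippet above are defined.
import json

cls = "agriculture"
with open(f"../datasets/unique_contexts/{cls}_unique_contexts.json", "r") as f:
    unique_contexts = json.load(f)

# Summarize each context (its first and last ~1000 tokens) and join the results
# into a single description that can be substituted into the prompt above.
summaries = [get_summary(context, tot_tokens=2000) for context in unique_contexts]
description = "\n\n".join(summaries)
```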
@@ -411,12 +411,12 @@ For the queries generated in Step-2, we will extract them and query LightRAG.
Code - + ```python def extract_queries(file_path): with open(file_path, 'r') as f: data = f.read() - + data = data.replace('**', '') queries = re.findall(r'- Question \d+: (.+)', data) @@ -470,7 +470,7 @@ def extract_queries(file_path): ```python @article{guo2024lightrag, -title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, +title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, year={2024}, eprint={2410.05779}, diff --git a/examples/batch_eval.py b/examples/batch_eval.py index 4601d2679..a85e1ede5 100644 --- a/examples/batch_eval.py +++ b/examples/batch_eval.py @@ -1,4 +1,3 @@ -import os import re import json import jsonlines @@ -9,28 +8,28 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): client = OpenAI() - with open(query_file, 'r') as f: + with open(query_file, "r") as f: data = f.read() - queries = re.findall(r'- Question \d+: (.+)', data) + queries = re.findall(r"- Question \d+: (.+)", data) - with open(result1_file, 'r') as f: + with open(result1_file, "r") as f: answers1 = json.load(f) - answers1 = [i['result'] for i in answers1] + answers1 = [i["result"] for i in answers1] - with open(result2_file, 'r') as f: + with open(result2_file, "r") as f: answers2 = json.load(f) - answers2 = [i['result'] for i in answers2] + answers2 = [i["result"] for i in answers2] requests = [] for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)): - sys_prompt = f""" + sys_prompt = """ ---Role--- You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. """ prompt = f""" - You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. + You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? 
@@ -69,7 +68,6 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): }} """ - request_data = { "custom_id": f"request-{i+1}", "method": "POST", @@ -78,22 +76,21 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): "model": "gpt-4o-mini", "messages": [ {"role": "system", "content": sys_prompt}, - {"role": "user", "content": prompt} + {"role": "user", "content": prompt}, ], - } + }, } - + requests.append(request_data) - with jsonlines.open(output_file_path, mode='w') as writer: + with jsonlines.open(output_file_path, mode="w") as writer: for request in requests: writer.write(request) print(f"Batch API requests written to {output_file_path}") batch_input_file = client.files.create( - file=open(output_file_path, "rb"), - purpose="batch" + file=open(output_file_path, "rb"), purpose="batch" ) batch_input_file_id = batch_input_file.id @@ -101,12 +98,11 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): input_file_id=batch_input_file_id, endpoint="/v1/chat/completions", completion_window="24h", - metadata={ - "description": "nightly eval job" - } + metadata={"description": "nightly eval job"}, ) - print(f'Batch {batch.id} has been created.') + print(f"Batch {batch.id} has been created.") + if __name__ == "__main__": - batch_eval() \ No newline at end of file + batch_eval() diff --git a/examples/generate_query.py b/examples/generate_query.py index 0ae82f407..705b23d3d 100644 --- a/examples/generate_query.py +++ b/examples/generate_query.py @@ -1,9 +1,8 @@ -import os - from openai import OpenAI # os.environ["OPENAI_API_KEY"] = "" + def openai_complete_if_cache( model="gpt-4o-mini", prompt=None, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -47,10 +46,10 @@ def openai_complete_if_cache( ... 
""" - result = openai_complete_if_cache(model='gpt-4o-mini', prompt=prompt) + result = openai_complete_if_cache(model="gpt-4o-mini", prompt=prompt) - file_path = f"./queries.txt" + file_path = "./queries.txt" with open(file_path, "w") as file: file.write(result) - print(f"Queries written to {file_path}") \ No newline at end of file + print(f"Queries written to {file_path}") diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index 62282a258..e29a6a9d1 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -122,4 +122,4 @@ async def test_funcs(): print(rag.query(query_text, param=QueryParam(mode="global"))) print("\nResult (Hybrid):") -print(rag.query(query_text, param=QueryParam(mode="hybrid"))) \ No newline at end of file +print(rag.query(query_text, param=QueryParam(mode="hybrid"))) diff --git a/examples/lightrag_bedrock_demo.py b/examples/lightrag_bedrock_demo.py index c515922e0..7e18ea575 100644 --- a/examples/lightrag_bedrock_demo.py +++ b/examples/lightrag_bedrock_demo.py @@ -20,13 +20,11 @@ llm_model_func=bedrock_complete, llm_model_name="Anthropic Claude 3 Haiku // Amazon Bedrock", embedding_func=EmbeddingFunc( - embedding_dim=1024, - max_token_size=8192, - func=bedrock_embedding - ) + embedding_dim=1024, max_token_size=8192, func=bedrock_embedding + ), ) -with open("./book.txt", 'r', encoding='utf-8') as f: +with open("./book.txt", "r", encoding="utf-8") as f: rag.insert(f.read()) for mode in ["naive", "local", "global", "hybrid"]: @@ -34,8 +32,5 @@ print(f"| {mode.capitalize()} |") print("+-" + "-" * len(mode) + "-+\n") print( - rag.query( - "What are the top themes in this story?", - param=QueryParam(mode=mode) - ) + rag.query("What are the top themes in this story?", param=QueryParam(mode=mode)) ) diff --git a/examples/lightrag_hf_demo.py b/examples/lightrag_hf_demo.py index baf62bdbf..87312307e 100644 --- a/examples/lightrag_hf_demo.py +++ b/examples/lightrag_hf_demo.py @@ -1,10 +1,9 @@ import os -import sys from lightrag import LightRAG, QueryParam from lightrag.llm import hf_model_complete, hf_embedding from lightrag.utils import EmbeddingFunc -from transformers import AutoModel,AutoTokenizer +from transformers import AutoModel, AutoTokenizer WORKING_DIR = "./dickens" @@ -13,16 +12,20 @@ rag = LightRAG( working_dir=WORKING_DIR, - llm_model_func=hf_model_complete, - llm_model_name='meta-llama/Llama-3.1-8B-Instruct', + llm_model_func=hf_model_complete, + llm_model_name="meta-llama/Llama-3.1-8B-Instruct", embedding_func=EmbeddingFunc( embedding_dim=384, max_token_size=5000, func=lambda texts: hf_embedding( - texts, - tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), - embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - ) + texts, + tokenizer=AutoTokenizer.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2" + ), + embed_model=AutoModel.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2" + ), + ), ), ) @@ -31,13 +34,21 @@ rag.insert(f.read()) # Perform naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) # Perform local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) # Perform global search -print(rag.query("What are the top 
themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) # Perform hybrid search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_ollama_demo.py b/examples/lightrag_ollama_demo.py index a2d04aa64..c61b71c06 100644 --- a/examples/lightrag_ollama_demo.py +++ b/examples/lightrag_ollama_demo.py @@ -11,15 +11,12 @@ rag = LightRAG( working_dir=WORKING_DIR, - llm_model_func=ollama_model_complete, - llm_model_name='your_model_name', + llm_model_func=ollama_model_complete, + llm_model_name="your_model_name", embedding_func=EmbeddingFunc( embedding_dim=768, max_token_size=8192, - func=lambda texts: ollama_embedding( - texts, - embed_model="nomic-embed-text" - ) + func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"), ), ) @@ -28,13 +25,21 @@ rag.insert(f.read()) # Perform naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) # Perform local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) # Perform global search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) # Perform hybrid search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_openai_compatible_demo.py b/examples/lightrag_openai_compatible_demo.py index 75ecc1180..fbad1190c 100644 --- a/examples/lightrag_openai_compatible_demo.py +++ b/examples/lightrag_openai_compatible_demo.py @@ -6,10 +6,11 @@ import numpy as np WORKING_DIR = "./dickens" - + if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) + async def llm_model_func( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -20,17 +21,19 @@ async def llm_model_func( history_messages=history_messages, api_key=os.getenv("UPSTAGE_API_KEY"), base_url="https://api.upstage.ai/v1/solar", - **kwargs + **kwargs, ) + async def embedding_func(texts: list[str]) -> np.ndarray: return await openai_embedding( texts, model="solar-embedding-1-large-query", api_key=os.getenv("UPSTAGE_API_KEY"), - base_url="https://api.upstage.ai/v1/solar" + base_url="https://api.upstage.ai/v1/solar", ) + # function test async def test_funcs(): result = await llm_model_func("How are you?") @@ -39,6 +42,7 @@ async def test_funcs(): result = await embedding_func(["How are you?"]) print("embedding_func: ", result) + asyncio.run(test_funcs()) @@ -46,10 +50,8 @@ async def test_funcs(): working_dir=WORKING_DIR, llm_model_func=llm_model_func, embedding_func=EmbeddingFunc( - embedding_dim=4096, - max_token_size=8192, - func=embedding_func - ) + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), ) @@ -57,13 +59,21 @@ async def test_funcs(): rag.insert(f.read()) # Perform naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top 
themes in this story?", param=QueryParam(mode="naive")) +) # Perform local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) # Perform global search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) # Perform hybrid search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_openai_demo.py b/examples/lightrag_openai_demo.py index fb1f055c3..a6e7f3b28 100644 --- a/examples/lightrag_openai_demo.py +++ b/examples/lightrag_openai_demo.py @@ -1,9 +1,7 @@ import os -import sys from lightrag import LightRAG, QueryParam -from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete -from transformers import AutoModel,AutoTokenizer +from lightrag.llm import gpt_4o_mini_complete WORKING_DIR = "./dickens" @@ -12,7 +10,7 @@ rag = LightRAG( working_dir=WORKING_DIR, - llm_model_func=gpt_4o_mini_complete + llm_model_func=gpt_4o_mini_complete, # llm_model_func=gpt_4o_complete ) @@ -21,13 +19,21 @@ rag.insert(f.read()) # Perform naive search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) # Perform local search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) # Perform global search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) # Perform hybrid search -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index b6b953f1c..f208177fa 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,4 +1,4 @@ -from .lightrag import LightRAG, QueryParam +from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam __version__ = "0.0.6" __author__ = "Zirui Guo" diff --git a/lightrag/base.py b/lightrag/base.py index d677c406d..50be4f621 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -12,15 +12,16 @@ T = TypeVar("T") + @dataclass class QueryParam: mode: Literal["local", "global", "hybrid", "naive"] = "global" only_need_context: bool = False response_type: str = "Multiple Paragraphs" top_k: int = 60 - max_token_for_text_unit: int = 4000 + max_token_for_text_unit: int = 4000 max_token_for_global_context: int = 4000 - max_token_for_local_context: int = 4000 + max_token_for_local_context: int = 4000 @dataclass @@ -36,6 +37,7 @@ async def query_done_callback(self): """commit the storage operations after querying""" pass + @dataclass class BaseVectorStorage(StorageNameSpace): embedding_func: EmbeddingFunc @@ -50,6 +52,7 @@ async def upsert(self, data: dict[str, dict]): """ raise NotImplementedError + @dataclass class BaseKVStorage(Generic[T], StorageNameSpace): async def all_keys(self) -> list[str]: @@ -72,7 +75,7 @@ async def upsert(self, data: 
dict[str, T]): async def drop(self): raise NotImplementedError - + @dataclass class BaseGraphStorage(StorageNameSpace): @@ -113,4 +116,4 @@ async def clustering(self, algorithm: str): raise NotImplementedError async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - raise NotImplementedError("Node embedding is not used in lightrag.") \ No newline at end of file + raise NotImplementedError("Node embedding is not used in lightrag.") diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 83312ef6e..5137af427 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -3,10 +3,12 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast, Any -from transformers import AutoModel,AutoTokenizer, AutoModelForCausalLM +from typing import Type, cast -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding, hf_model_complete, hf_embedding +from .llm import ( + gpt_4o_mini_complete, + openai_embedding, +) from .operate import ( chunking_by_token_size, extract_entities, @@ -37,6 +39,7 @@ QueryParam, ) + def always_get_an_event_loop() -> asyncio.AbstractEventLoop: try: loop = asyncio.get_running_loop() @@ -69,7 +72,6 @@ class LightRAG: "dimensions": 1536, "num_walks": 10, "walk_length": 40, - "num_walks": 10, "window_size": 2, "iterations": 3, "random_seed": 3, @@ -77,13 +79,13 @@ class LightRAG: ) # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) - embedding_func: EmbeddingFunc = field(default_factory=lambda:openai_embedding) + embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) embedding_batch_num: int = 32 embedding_func_max_async: int = 16 # LLM - llm_model_func: callable = gpt_4o_mini_complete#hf_model_complete# - llm_model_name: str = 'meta-llama/Llama-3.2-1B-Instruct'#'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' + llm_model_func: callable = gpt_4o_mini_complete # hf_model_complete# + llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" #'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = 32768 llm_model_max_async: int = 16 @@ -98,11 +100,11 @@ class LightRAG: addon_params: dict = field(default_factory=dict) convert_response_to_json_func: callable = convert_response_to_json - def __post_init__(self): + def __post_init__(self): log_file = os.path.join(self.working_dir, "lightrag.log") set_logger(log_file) logger.info(f"Logger initialized for working directory: {self.working_dir}") - + _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") @@ -133,30 +135,24 @@ def __post_init__(self): self.embedding_func ) - self.entities_vdb = ( - self.vector_db_storage_cls( - namespace="entities", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"entity_name"} - ) + self.entities_vdb = self.vector_db_storage_cls( + namespace="entities", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"entity_name"}, ) - self.relationships_vdb = ( - self.vector_db_storage_cls( - namespace="relationships", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"src_id", "tgt_id"} - ) + self.relationships_vdb = self.vector_db_storage_cls( + namespace="relationships", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"src_id", "tgt_id"}, ) - self.chunks_vdb = ( - self.vector_db_storage_cls( - 
namespace="chunks", - global_config=asdict(self), - embedding_func=self.embedding_func, - ) + self.chunks_vdb = self.vector_db_storage_cls( + namespace="chunks", + global_config=asdict(self), + embedding_func=self.embedding_func, ) - + self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( partial(self.llm_model_func, hashing_kv=self.llm_response_cache) ) @@ -177,7 +173,7 @@ async def ainsert(self, string_or_strings): _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} if not len(new_docs): - logger.warning(f"All docs are already in the storage") + logger.warning("All docs are already in the storage") return logger.info(f"[New Docs] inserting {len(new_docs)} docs") @@ -203,7 +199,7 @@ async def ainsert(self, string_or_strings): k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys } if not len(inserting_chunks): - logger.warning(f"All chunks are already in the storage") + logger.warning("All chunks are already in the storage") return logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") @@ -246,7 +242,7 @@ async def _insert_done(self): def query(self, query: str, param: QueryParam = QueryParam()): loop = always_get_an_event_loop() return loop.run_until_complete(self.aquery(query, param)) - + async def aquery(self, query: str, param: QueryParam = QueryParam()): if param.mode == "local": response = await local_query( @@ -290,7 +286,6 @@ async def aquery(self, query: str, param: QueryParam = QueryParam()): raise ValueError(f"Unknown mode {param.mode}") await self._query_done() return response - async def _query_done(self): tasks = [] @@ -299,5 +294,3 @@ async def _query_done(self): continue tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) await asyncio.gather(*tasks) - - diff --git a/lightrag/llm.py b/lightrag/llm.py index 48defb4dd..be801e0cf 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -1,9 +1,7 @@ import os import copy import json -import botocore import aioboto3 -import botocore.errorfactory import numpy as np import ollama from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout @@ -13,24 +11,34 @@ wait_exponential, retry_if_exception_type, ) -from transformers import AutoModel,AutoTokenizer, AutoModelForCausalLM +from transformers import AutoTokenizer, AutoModelForCausalLM import torch from .base import BaseKVStorage from .utils import compute_args_hash, wrap_embedding_func_with_attrs -import copy + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) async def openai_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], base_url=None, api_key=None, **kwargs + model, + prompt, + system_prompt=None, + history_messages=[], + base_url=None, + api_key=None, + **kwargs, ) -> str: if api_key: os.environ["OPENAI_API_KEY"] = api_key - openai_async_client = AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + openai_async_client = ( + AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + ) hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages = [] if system_prompt: @@ -64,43 +72,56 @@ class BedrockError(Exception): retry=retry_if_exception_type((BedrockError)), ) async def bedrock_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], - 
aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None, **kwargs + model, + prompt, + system_prompt=None, + history_messages=[], + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, + **kwargs, ) -> str: - os.environ['AWS_ACCESS_KEY_ID'] = os.environ.get('AWS_ACCESS_KEY_ID', aws_access_key_id) - os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret_access_key) - os.environ['AWS_SESSION_TOKEN'] = os.environ.get('AWS_SESSION_TOKEN', aws_session_token) + os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get( + "AWS_ACCESS_KEY_ID", aws_access_key_id + ) + os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get( + "AWS_SECRET_ACCESS_KEY", aws_secret_access_key + ) + os.environ["AWS_SESSION_TOKEN"] = os.environ.get( + "AWS_SESSION_TOKEN", aws_session_token + ) # Fix message history format messages = [] for history_message in history_messages: message = copy.copy(history_message) - message['content'] = [{'text': message['content']}] + message["content"] = [{"text": message["content"]}] messages.append(message) # Add user prompt - messages.append({'role': "user", 'content': [{'text': prompt}]}) + messages.append({"role": "user", "content": [{"text": prompt}]}) # Initialize Converse API arguments - args = { - 'modelId': model, - 'messages': messages - } + args = {"modelId": model, "messages": messages} # Define system prompt if system_prompt: - args['system'] = [{'text': system_prompt}] + args["system"] = [{"text": system_prompt}] # Map and set up inference parameters inference_params_map = { - 'max_tokens': "maxTokens", - 'top_p': "topP", - 'stop_sequences': "stopSequences" + "max_tokens": "maxTokens", + "top_p": "topP", + "stop_sequences": "stopSequences", } - if (inference_params := list(set(kwargs) & set(['max_tokens', 'temperature', 'top_p', 'stop_sequences']))): - args['inferenceConfig'] = {} + if inference_params := list( + set(kwargs) & set(["max_tokens", "temperature", "top_p", "stop_sequences"]) + ): + args["inferenceConfig"] = {} for param in inference_params: - args['inferenceConfig'][inference_params_map.get(param, param)] = kwargs.pop(param) + args["inferenceConfig"][inference_params_map.get(param, param)] = ( + kwargs.pop(param) + ) hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) if hashing_kv is not None: @@ -112,31 +133,33 @@ async def bedrock_complete_if_cache( # Call model via Converse API session = aioboto3.Session() async with session.client("bedrock-runtime") as bedrock_async_client: - try: response = await bedrock_async_client.converse(**args, **kwargs) except Exception as e: raise BedrockError(e) if hashing_kv is not None: - await hashing_kv.upsert({ - args_hash: { - 'return': response['output']['message']['content'][0]['text'], - 'model': model + await hashing_kv.upsert( + { + args_hash: { + "return": response["output"]["message"]["content"][0]["text"], + "model": model, + } } - }) + ) + + return response["output"]["message"]["content"][0]["text"] - return response['output']['message']['content'][0]['text'] async def hf_model_if_cache( model, prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: model_name = model - hf_tokenizer = AutoTokenizer.from_pretrained(model_name,device_map = 'auto') - if hf_tokenizer.pad_token == None: + hf_tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto") + if hf_tokenizer.pad_token is None: # print("use eos token") hf_tokenizer.pad_token = hf_tokenizer.eos_token - hf_model = 
AutoModelForCausalLM.from_pretrained(model_name,device_map = 'auto') + hf_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages = [] if system_prompt: @@ -149,30 +172,51 @@ async def hf_model_if_cache( if_cache_return = await hashing_kv.get_by_id(args_hash) if if_cache_return is not None: return if_cache_return["return"] - input_prompt = '' + input_prompt = "" try: - input_prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - except: + input_prompt = hf_tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + except Exception: try: ori_message = copy.deepcopy(messages) - if messages[0]['role'] == "system": - messages[1]['content'] = "" + messages[0]['content'] + "\n" + messages[1]['content'] + if messages[0]["role"] == "system": + messages[1]["content"] = ( + "" + + messages[0]["content"] + + "\n" + + messages[1]["content"] + ) messages = messages[1:] - input_prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - except: + input_prompt = hf_tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + except Exception: len_message = len(ori_message) for msgid in range(len_message): - input_prompt =input_prompt+ '<'+ori_message[msgid]['role']+'>'+ori_message[msgid]['content']+'\n' - - input_ids = hf_tokenizer(input_prompt, return_tensors='pt', padding=True, truncation=True).to("cuda") - output = hf_model.generate(**input_ids, max_new_tokens=200, num_return_sequences=1,early_stopping = True) + input_prompt = ( + input_prompt + + "<" + + ori_message[msgid]["role"] + + ">" + + ori_message[msgid]["content"] + + "\n" + ) + + input_ids = hf_tokenizer( + input_prompt, return_tensors="pt", padding=True, truncation=True + ).to("cuda") + output = hf_model.generate( + **input_ids, max_new_tokens=200, num_return_sequences=1, early_stopping=True + ) response_text = hf_tokenizer.decode(output[0], skip_special_tokens=True) if hashing_kv is not None: - await hashing_kv.upsert( - {args_hash: {"return": response_text, "model": model}} - ) + await hashing_kv.upsert({args_hash: {"return": response_text, "model": model}}) return response_text + async def ollama_model_if_cache( model, prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -202,6 +246,7 @@ async def ollama_model_if_cache( return result + async def gpt_4o_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -241,7 +286,7 @@ async def bedrock_complete( async def hf_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - model_name = kwargs['hashing_kv'].global_config['llm_model_name'] + model_name = kwargs["hashing_kv"].global_config["llm_model_name"] return await hf_model_if_cache( model_name, prompt, @@ -250,10 +295,11 @@ async def hf_model_complete( **kwargs, ) + async def ollama_model_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - model_name = kwargs['hashing_kv'].global_config['llm_model_name'] + model_name = kwargs["hashing_kv"].global_config["llm_model_name"] return await ollama_model_if_cache( model_name, prompt, @@ -262,17 +308,25 @@ async def ollama_model_complete( **kwargs, ) + @wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, 
APIConnectionError, Timeout)), ) -async def openai_embedding(texts: list[str], model: str = "text-embedding-3-small", base_url: str = None, api_key: str = None) -> np.ndarray: +async def openai_embedding( + texts: list[str], + model: str = "text-embedding-3-small", + base_url: str = None, + api_key: str = None, +) -> np.ndarray: if api_key: os.environ["OPENAI_API_KEY"] = api_key - openai_async_client = AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + openai_async_client = ( + AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + ) response = await openai_async_client.embeddings.create( model=model, input=texts, encoding_format="float" ) @@ -286,28 +340,37 @@ async def openai_embedding(texts: list[str], model: str = "text-embedding-3-smal # retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), # TODO: fix exceptions # ) async def bedrock_embedding( - texts: list[str], model: str = "amazon.titan-embed-text-v2:0", - aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None) -> np.ndarray: - os.environ['AWS_ACCESS_KEY_ID'] = os.environ.get('AWS_ACCESS_KEY_ID', aws_access_key_id) - os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ.get('AWS_SECRET_ACCESS_KEY', aws_secret_access_key) - os.environ['AWS_SESSION_TOKEN'] = os.environ.get('AWS_SESSION_TOKEN', aws_session_token) + texts: list[str], + model: str = "amazon.titan-embed-text-v2:0", + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, +) -> np.ndarray: + os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get( + "AWS_ACCESS_KEY_ID", aws_access_key_id + ) + os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get( + "AWS_SECRET_ACCESS_KEY", aws_secret_access_key + ) + os.environ["AWS_SESSION_TOKEN"] = os.environ.get( + "AWS_SESSION_TOKEN", aws_session_token + ) session = aioboto3.Session() async with session.client("bedrock-runtime") as bedrock_async_client: - if (model_provider := model.split(".")[0]) == "amazon": embed_texts = [] for text in texts: if "v2" in model: - body = json.dumps({ - 'inputText': text, - # 'dimensions': embedding_dim, - 'embeddingTypes': ["float"] - }) + body = json.dumps( + { + "inputText": text, + # 'dimensions': embedding_dim, + "embeddingTypes": ["float"], + } + ) elif "v1" in model: - body = json.dumps({ - 'inputText': text - }) + body = json.dumps({"inputText": text}) else: raise ValueError(f"Model {model} is not supported!") @@ -315,29 +378,27 @@ async def bedrock_embedding( modelId=model, body=body, accept="application/json", - contentType="application/json" + contentType="application/json", ) - response_body = await response.get('body').json() + response_body = await response.get("body").json() - embed_texts.append(response_body['embedding']) + embed_texts.append(response_body["embedding"]) elif model_provider == "cohere": - body = json.dumps({ - 'texts': texts, - 'input_type': "search_document", - 'truncate': "NONE" - }) + body = json.dumps( + {"texts": texts, "input_type": "search_document", "truncate": "NONE"} + ) response = await bedrock_async_client.invoke_model( model=model, body=body, accept="application/json", - contentType="application/json" + contentType="application/json", ) - response_body = json.loads(response.get('body').read()) + response_body = json.loads(response.get("body").read()) - embed_texts = response_body['embeddings'] + embed_texts = response_body["embeddings"] else: raise ValueError(f"Model provider '{model_provider}' is not supported!") @@ -345,12 +406,15 @@ async def 
bedrock_embedding( async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: - input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids + input_ids = tokenizer( + texts, return_tensors="pt", padding=True, truncation=True + ).input_ids with torch.no_grad(): outputs = embed_model(input_ids) embeddings = outputs.last_hidden_state.mean(dim=1) return embeddings.detach().numpy() + async def ollama_embedding(texts: list[str], embed_model) -> np.ndarray: embed_text = [] for text in texts: @@ -359,11 +423,12 @@ async def ollama_embedding(texts: list[str], embed_model) -> np.ndarray: return embed_text + if __name__ == "__main__": import asyncio async def main(): - result = await gpt_4o_mini_complete('How are you?') + result = await gpt_4o_mini_complete("How are you?") print(result) asyncio.run(main()) diff --git a/lightrag/operate.py b/lightrag/operate.py index 930ceb2a0..a0729cd81 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -25,6 +25,7 @@ ) from .prompt import GRAPH_FIELD_SEP, PROMPTS + def chunking_by_token_size( content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" ): @@ -45,6 +46,7 @@ def chunking_by_token_size( ) return results + async def _handle_entity_relation_summary( entity_or_relation_name: str, description: str, @@ -229,9 +231,10 @@ async def _merge_edges_then_upsert( description=description, keywords=keywords, ) - + return edge_data + async def extract_entities( chunks: dict[str, TextChunkSchema], knwoledge_graph_inst: BaseGraphStorage, @@ -352,7 +355,9 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): logger.warning("Didn't extract any entities, maybe your LLM is not working") return None if not len(all_relationships_data): - logger.warning("Didn't extract any relationships, maybe your LLM is not working") + logger.warning( + "Didn't extract any relationships, maybe your LLM is not working" + ) return None if entity_vdb is not None: @@ -370,7 +375,10 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): { "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], - "content": dp["keywords"] + dp["src_id"] + dp["tgt_id"] + dp["description"], + "content": dp["keywords"] + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], } for dp in all_relationships_data } @@ -378,6 +386,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): return knwoledge_graph_inst + async def local_query( query, knowledge_graph_inst: BaseGraphStorage, @@ -393,19 +402,24 @@ async def local_query( kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) result = await use_model_func(kw_prompt) - + try: keywords_data = json.loads(result) keywords = keywords_data.get("low_level_keywords", []) - keywords = ', '.join(keywords) - except json.JSONDecodeError as e: + keywords = ", ".join(keywords) + except json.JSONDecodeError: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() - result = '{' + result.split('{')[1].split('}')[0] + '}' + result = ( + result.replace(kw_prompt[:-1], "") + .replace("user", "") + .replace("model", "") + .strip() + ) + result = "{" + result.split("{")[1].split("}")[0] + "}" keywords_data = json.loads(result) keywords = keywords_data.get("low_level_keywords", []) - keywords = ', '.join(keywords) + keywords = ", ".join(keywords) # Handle parsing error except 
json.JSONDecodeError as e: print(f"JSON parsing error: {e}") @@ -430,11 +444,20 @@ async def local_query( query, system_prompt=sys_prompt, ) - if len(response)>len(sys_prompt): - response = response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() - + if len(response) > len(sys_prompt): + response = ( + response.replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) + return response + async def _build_local_query_context( query, knowledge_graph_inst: BaseGraphStorage, @@ -516,6 +539,7 @@ async def _build_local_query_context( ``` """ + async def _find_most_related_text_unit_from_entities( node_datas: list[dict], query_param: QueryParam, @@ -576,6 +600,7 @@ async def _find_most_related_text_unit_from_entities( all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units] return all_text_units + async def _find_most_related_edges_from_entities( node_datas: list[dict], query_param: QueryParam, @@ -609,6 +634,7 @@ async def _find_most_related_edges_from_entities( ) return all_edges_data + async def global_query( query, knowledge_graph_inst: BaseGraphStorage, @@ -624,20 +650,25 @@ async def global_query( kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) result = await use_model_func(kw_prompt) - + try: keywords_data = json.loads(result) keywords = keywords_data.get("high_level_keywords", []) - keywords = ', '.join(keywords) - except json.JSONDecodeError as e: + keywords = ", ".join(keywords) + except json.JSONDecodeError: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() - result = '{' + result.split('{')[1].split('}')[0] + '}' + result = ( + result.replace(kw_prompt[:-1], "") + .replace("user", "") + .replace("model", "") + .strip() + ) + result = "{" + result.split("{")[1].split("}")[0] + "}" keywords_data = json.loads(result) keywords = keywords_data.get("high_level_keywords", []) - keywords = ', '.join(keywords) - + keywords = ", ".join(keywords) + except json.JSONDecodeError as e: # Handle parsing error print(f"JSON parsing error: {e}") @@ -651,12 +682,12 @@ async def global_query( text_chunks_db, query_param, ) - + if query_param.only_need_context: return context if context is None: return PROMPTS["fail_response"] - + sys_prompt_temp = PROMPTS["rag_response"] sys_prompt = sys_prompt_temp.format( context_data=context, response_type=query_param.response_type @@ -665,11 +696,20 @@ async def global_query( query, system_prompt=sys_prompt, ) - if len(response)>len(sys_prompt): - response = response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() - + if len(response) > len(sys_prompt): + response = ( + response.replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) + return response + async def _build_global_query_context( keywords, knowledge_graph_inst: BaseGraphStorage, @@ -679,14 +719,14 @@ async def _build_global_query_context( query_param: QueryParam, ): results = await relationships_vdb.query(keywords, top_k=query_param.top_k) - + if not len(results): return None - + edge_datas = await asyncio.gather( *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results] ) - + if not all([n is not None for n in edge_datas]): logger.warning("Some edges are missing, 
maybe the storage is damaged") edge_degree = await asyncio.gather( @@ -765,6 +805,7 @@ async def _build_global_query_context( ``` """ + async def _find_most_related_entities_from_relationships( edge_datas: list[dict], query_param: QueryParam, @@ -774,7 +815,7 @@ async def _find_most_related_entities_from_relationships( for e in edge_datas: entity_names.add(e["src_id"]) entity_names.add(e["tgt_id"]) - + node_datas = await asyncio.gather( *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] ) @@ -795,13 +836,13 @@ async def _find_most_related_entities_from_relationships( return node_datas + async def _find_related_text_unit_from_relationships( edge_datas: list[dict], query_param: QueryParam, text_chunks_db: BaseKVStorage[TextChunkSchema], knowledge_graph_inst: BaseGraphStorage, ): - text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) for dp in edge_datas @@ -816,15 +857,13 @@ async def _find_related_text_unit_from_relationships( "data": await text_chunks_db.get_by_id(c_id), "order": index, } - + if any([v is None for v in all_text_units_lookup.values()]): logger.warning("Text chunks are missing, maybe the storage is damaged") all_text_units = [ {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None ] - all_text_units = sorted( - all_text_units, key=lambda x: x["order"] - ) + all_text_units = sorted(all_text_units, key=lambda x: x["order"]) all_text_units = truncate_list_by_token_size( all_text_units, key=lambda x: x["data"]["content"], @@ -834,6 +873,7 @@ async def _find_related_text_unit_from_relationships( return all_text_units + async def hybrid_query( query, knowledge_graph_inst: BaseGraphStorage, @@ -849,24 +889,29 @@ async def hybrid_query( kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) - + result = await use_model_func(kw_prompt) try: keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) ll_keywords = keywords_data.get("low_level_keywords", []) - hl_keywords = ', '.join(hl_keywords) - ll_keywords = ', '.join(ll_keywords) - except json.JSONDecodeError as e: + hl_keywords = ", ".join(hl_keywords) + ll_keywords = ", ".join(ll_keywords) + except json.JSONDecodeError: try: - result = result.replace(kw_prompt[:-1],'').replace('user','').replace('model','').strip() - result = '{' + result.split('{')[1].split('}')[0] + '}' + result = ( + result.replace(kw_prompt[:-1], "") + .replace("user", "") + .replace("model", "") + .strip() + ) + result = "{" + result.split("{")[1].split("}")[0] + "}" keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) ll_keywords = keywords_data.get("low_level_keywords", []) - hl_keywords = ', '.join(hl_keywords) - ll_keywords = ', '.join(ll_keywords) + hl_keywords = ", ".join(hl_keywords) + ll_keywords = ", ".join(ll_keywords) # Handle parsing error except json.JSONDecodeError as e: print(f"JSON parsing error: {e}") @@ -897,7 +942,7 @@ async def hybrid_query( return context if context is None: return PROMPTS["fail_response"] - + sys_prompt_temp = PROMPTS["rag_response"] sys_prompt = sys_prompt_temp.format( context_data=context, response_type=query_param.response_type @@ -906,53 +951,78 @@ async def hybrid_query( query, system_prompt=sys_prompt, ) - if len(response)>len(sys_prompt): - response = response.replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() + if len(response) > len(sys_prompt): + response = 
( + response.replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) return response + def combine_contexts(high_level_context, low_level_context): # Function to extract entities, relationships, and sources from context strings def extract_sections(context): - entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - sources_match = re.search(r'-----Sources-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - - entities = entities_match.group(1) if entities_match else '' - relationships = relationships_match.group(1) if relationships_match else '' - sources = sources_match.group(1) if sources_match else '' - + entities_match = re.search( + r"-----Entities-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + relationships_match = re.search( + r"-----Relationships-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + sources_match = re.search( + r"-----Sources-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + + entities = entities_match.group(1) if entities_match else "" + relationships = relationships_match.group(1) if relationships_match else "" + sources = sources_match.group(1) if sources_match else "" + return entities, relationships, sources - + # Extract sections from both contexts - if high_level_context==None: - warnings.warn("High Level context is None. Return empty High entity/relationship/source") - hl_entities, hl_relationships, hl_sources = '','','' + if high_level_context is None: + warnings.warn( + "High Level context is None. Return empty High entity/relationship/source" + ) + hl_entities, hl_relationships, hl_sources = "", "", "" else: hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) - - if low_level_context==None: - warnings.warn("Low Level context is None. Return empty Low entity/relationship/source") - ll_entities, ll_relationships, ll_sources = '','','' + if low_level_context is None: + warnings.warn( + "Low Level context is None. 
Return empty Low entity/relationship/source" + ) + ll_entities, ll_relationships, ll_sources = "", "", "" else: ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) - - # Combine and deduplicate the entities - combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n'))) - combined_entities = '\n'.join(combined_entities_set) - + combined_entities_set = set( + filter(None, hl_entities.strip().split("\n") + ll_entities.strip().split("\n")) + ) + combined_entities = "\n".join(combined_entities_set) + # Combine and deduplicate the relationships - combined_relationships_set = set(filter(None, hl_relationships.strip().split('\n') + ll_relationships.strip().split('\n'))) - combined_relationships = '\n'.join(combined_relationships_set) - + combined_relationships_set = set( + filter( + None, + hl_relationships.strip().split("\n") + ll_relationships.strip().split("\n"), + ) + ) + combined_relationships = "\n".join(combined_relationships_set) + # Combine and deduplicate the sources - combined_sources_set = set(filter(None, hl_sources.strip().split('\n') + ll_sources.strip().split('\n'))) - combined_sources = '\n'.join(combined_sources_set) - + combined_sources_set = set( + filter(None, hl_sources.strip().split("\n") + ll_sources.strip().split("\n")) + ) + combined_sources = "\n".join(combined_sources_set) + # Format the combined context return f""" -----Entities----- @@ -964,6 +1034,7 @@ def extract_sections(context): {combined_sources} """ + async def naive_query( query, chunks_vdb: BaseVectorStorage, @@ -996,8 +1067,16 @@ async def naive_query( system_prompt=sys_prompt, ) - if len(response)>len(sys_prompt): - response = response[len(sys_prompt):].replace(sys_prompt,'').replace('user','').replace('model','').replace(query,'').replace('','').replace('','').strip() - - return response + if len(response) > len(sys_prompt): + response = ( + response[len(sys_prompt) :] + .replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) + return response diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 5d28e49c5..6bd9b638f 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -9,9 +9,7 @@ PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"] -PROMPTS[ - "entity_extraction" -] = """-Goal- +PROMPTS["entity_extraction"] = """-Goal- Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. -Steps- @@ -32,7 +30,7 @@ 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. Format the content-level key words as ("content_keywords"{tuple_delimiter}) - + 4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. 5. When finished, output {completion_delimiter} @@ -146,9 +144,7 @@ PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." -PROMPTS[ - "rag_response" -] = """---Role--- +PROMPTS["rag_response"] = """---Role--- You are a helpful assistant responding to questions about data in the tables provided. 
@@ -241,9 +237,7 @@ """ -PROMPTS[ - "naive_rag_response" -] = """You're a helpful assistant +PROMPTS["naive_rag_response"] = """You're a helpful assistant Below are the knowledge you know: {content_data} --- diff --git a/lightrag/storage.py b/lightrag/storage.py index 2f2bb7d8f..1f22fc565 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -1,16 +1,11 @@ import asyncio import html -import json import os -from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Union, cast -import pickle -import hnswlib import networkx as nx import numpy as np from nano_vectordb import NanoVectorDB -import xxhash from .utils import load_json, logger, write_json from .base import ( @@ -19,6 +14,7 @@ BaseVectorStorage, ) + @dataclass class JsonKVStorage(BaseKVStorage): def __post_init__(self): @@ -59,12 +55,12 @@ async def upsert(self, data: dict[str, dict]): async def drop(self): self._data = {} + @dataclass class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = 0.2 def __post_init__(self): - self._client_file_name = os.path.join( self.global_config["working_dir"], f"vdb_{self.namespace}.json" ) @@ -118,6 +114,7 @@ async def query(self, query: str, top_k=5): async def index_done_callback(self): self._client.save() + @dataclass class NetworkXStorage(BaseGraphStorage): @staticmethod @@ -142,7 +139,9 @@ def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: graph = graph.copy() graph = cast(nx.Graph, largest_connected_component(graph)) - node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + node_mapping = { + node: html.unescape(node.upper().strip()) for node in graph.nodes() + } # type: ignore graph = nx.relabel_nodes(graph, node_mapping) return NetworkXStorage._stabilize_graph(graph) diff --git a/lightrag/utils.py b/lightrag/utils.py index 9496cf34b..67d094c62 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -16,18 +16,22 @@ logger = logging.getLogger("lightrag") + def set_logger(log_file: str): logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler(log_file) file_handler.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) file_handler.setFormatter(formatter) if not logger.handlers: logger.addHandler(file_handler) + @dataclass class EmbeddingFunc: embedding_dim: int @@ -36,7 +40,8 @@ class EmbeddingFunc: async def __call__(self, *args, **kwargs) -> np.ndarray: return await self.func(*args, **kwargs) - + + def locate_json_string_body_from_string(content: str) -> Union[str, None]: """Locate the JSON string body from a string""" maybe_json_str = re.search(r"{.*}", content, re.DOTALL) @@ -45,6 +50,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]: else: return None + def convert_response_to_json(response: str) -> dict: json_str = locate_json_string_body_from_string(response) assert json_str is not None, f"Unable to parse JSON from response: {response}" @@ -55,12 +61,15 @@ def convert_response_to_json(response: str) -> dict: logger.error(f"Failed to parse JSON: {json_str}") raise e from None + def compute_args_hash(*args): return md5(str(args).encode()).hexdigest() + def compute_mdhash_id(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() + def limit_async_func_call(max_size: int, waitting_time: float = 
0.0001): """Add restriction of maximum async calling times for a async func""" @@ -82,6 +91,7 @@ async def wait_func(*args, **kwargs): return final_decro + def wrap_embedding_func_with_attrs(**kwargs): """Wrap a function with attributes""" @@ -91,16 +101,19 @@ def final_decro(func) -> EmbeddingFunc: return final_decro + def load_json(file_name): if not os.path.exists(file_name): return None with open(file_name, encoding="utf-8") as f: return json.load(f) + def write_json(json_obj, file_name): with open(file_name, "w", encoding="utf-8") as f: json.dump(json_obj, f, indent=2, ensure_ascii=False) + def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): global ENCODER if ENCODER is None: @@ -116,12 +129,14 @@ def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): content = ENCODER.decode(tokens) return content + def pack_user_ass_to_openai_messages(*args: str): roles = ["user", "assistant"] return [ {"role": roles[i % 2], "content": content} for i, content in enumerate(args) ] + def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: """Split a string by multiple markers""" if not markers: @@ -129,6 +144,7 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] results = re.split("|".join(re.escape(marker) for marker in markers), content) return [r.strip() for r in results if r.strip()] + # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: @@ -141,9 +157,11 @@ def clean_str(input: Any) -> str: # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + def is_float_regex(value): return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int): """Truncate a list of data by token size""" if max_token_size <= 0: @@ -155,11 +173,13 @@ def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: return list_data[:i] return list_data + def list_of_list_to_csv(data: list[list]): return "\n".join( [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] ) + def save_data_to_file(data, file_name): - with open(file_name, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file + with open(file_name, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/reproduce/Step_0.py b/reproduce/Step_0.py index 9053aa40e..2d97bd14e 100644 --- a/reproduce/Step_0.py +++ b/reproduce/Step_0.py @@ -3,11 +3,11 @@ import glob import argparse -def extract_unique_contexts(input_directory, output_directory): +def extract_unique_contexts(input_directory, output_directory): os.makedirs(output_directory, exist_ok=True) - jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl')) + jsonl_files = glob.glob(os.path.join(input_directory, "*.jsonl")) print(f"Found {len(jsonl_files)} JSONL files.") for file_path in jsonl_files: @@ -21,18 +21,20 @@ def extract_unique_contexts(input_directory, output_directory): print(f"Processing file: {filename}") try: - with open(file_path, 'r', encoding='utf-8') as infile: + with open(file_path, "r", encoding="utf-8") as infile: for line_number, line in enumerate(infile, start=1): line = line.strip() if not line: continue try: json_obj = json.loads(line) - context = json_obj.get('context') + context = 
json_obj.get("context") if context and context not in unique_contexts_dict: unique_contexts_dict[context] = None except json.JSONDecodeError as e: - print(f"JSON decoding error in file {filename} at line {line_number}: {e}") + print( + f"JSON decoding error in file {filename} at line {line_number}: {e}" + ) except FileNotFoundError: print(f"File not found: {filename}") continue @@ -41,10 +43,12 @@ def extract_unique_contexts(input_directory, output_directory): continue unique_contexts_list = list(unique_contexts_dict.keys()) - print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.") + print( + f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}." + ) try: - with open(output_path, 'w', encoding='utf-8') as outfile: + with open(output_path, "w", encoding="utf-8") as outfile: json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4) print(f"Unique `context` entries have been saved to: {output_filename}") except Exception as e: @@ -55,8 +59,10 @@ def extract_unique_contexts(input_directory, output_directory): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-i', '--input_dir', type=str, default='../datasets') - parser.add_argument('-o', '--output_dir', type=str, default='../datasets/unique_contexts') + parser.add_argument("-i", "--input_dir", type=str, default="../datasets") + parser.add_argument( + "-o", "--output_dir", type=str, default="../datasets/unique_contexts" + ) args = parser.parse_args() diff --git a/reproduce/Step_1.py b/reproduce/Step_1.py index 08e497cbd..43c44056d 100644 --- a/reproduce/Step_1.py +++ b/reproduce/Step_1.py @@ -4,10 +4,11 @@ from lightrag import LightRAG + def insert_text(rag, file_path): - with open(file_path, mode='r') as f: + with open(file_path, mode="r") as f: unique_contexts = json.load(f) - + retries = 0 max_retries = 3 while retries < max_retries: @@ -21,6 +22,7 @@ def insert_text(rag, file_path): if retries == max_retries: print("Insertion failed after exceeding the maximum number of retries") + cls = "agriculture" WORKING_DIR = "../{cls}" @@ -29,4 +31,4 @@ def insert_text(rag, file_path): rag = LightRAG(working_dir=WORKING_DIR) -insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") \ No newline at end of file +insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") diff --git a/reproduce/Step_1_openai_compatible.py b/reproduce/Step_1_openai_compatible.py index b5c6aef3c..8e67cfb8b 100644 --- a/reproduce/Step_1_openai_compatible.py +++ b/reproduce/Step_1_openai_compatible.py @@ -7,6 +7,7 @@ from lightrag.utils import EmbeddingFunc from lightrag.llm import openai_complete_if_cache, openai_embedding + ## For Upstage API # please check if embedding_dim=4096 in lightrag.py and llm.py in lightrag direcotry async def llm_model_func( @@ -19,22 +20,26 @@ async def llm_model_func( history_messages=history_messages, api_key=os.getenv("UPSTAGE_API_KEY"), base_url="https://api.upstage.ai/v1/solar", - **kwargs + **kwargs, ) + async def embedding_func(texts: list[str]) -> np.ndarray: return await openai_embedding( texts, model="solar-embedding-1-large-query", api_key=os.getenv("UPSTAGE_API_KEY"), - base_url="https://api.upstage.ai/v1/solar" + base_url="https://api.upstage.ai/v1/solar", ) + + ## /For Upstage API + def insert_text(rag, file_path): - with open(file_path, mode='r') as f: + with open(file_path, mode="r") as f: unique_contexts = json.load(f) - + retries = 0 max_retries = 3 while retries < 
max_retries: @@ -48,19 +53,19 @@ def insert_text(rag, file_path): if retries == max_retries: print("Insertion failed after exceeding the maximum number of retries") + cls = "mix" WORKING_DIR = f"../{cls}" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) -rag = LightRAG(working_dir=WORKING_DIR, - llm_model_func=llm_model_func, - embedding_func=EmbeddingFunc( - embedding_dim=4096, - max_token_size=8192, - func=embedding_func - ) - ) +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), +) insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") diff --git a/reproduce/Step_2.py b/reproduce/Step_2.py index b00c19b8e..557c77147 100644 --- a/reproduce/Step_2.py +++ b/reproduce/Step_2.py @@ -1,8 +1,8 @@ -import os import json from openai import OpenAI from transformers import GPT2Tokenizer + def openai_complete_if_cache( model="gpt-4o", prompt=None, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -19,24 +19,26 @@ def openai_complete_if_cache( ) return response.choices[0].message.content -tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + +tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + def get_summary(context, tot_tokens=2000): tokens = tokenizer.tokenize(context) half_tokens = tot_tokens // 2 - start_tokens = tokens[1000:1000 + half_tokens] - end_tokens = tokens[-(1000 + half_tokens):1000] + start_tokens = tokens[1000 : 1000 + half_tokens] + end_tokens = tokens[-(1000 + half_tokens) : 1000] summary_tokens = start_tokens + end_tokens summary = tokenizer.convert_tokens_to_string(summary_tokens) - + return summary -clses = ['agriculture'] +clses = ["agriculture"] for cls in clses: - with open(f'../datasets/unique_contexts/{cls}_unique_contexts.json', mode='r') as f: + with open(f"../datasets/unique_contexts/{cls}_unique_contexts.json", mode="r") as f: unique_contexts = json.load(f) summaries = [get_summary(context) for context in unique_contexts] @@ -67,10 +69,10 @@ def get_summary(context, tot_tokens=2000): ... 
""" - result = openai_complete_if_cache(model='gpt-4o', prompt=prompt) + result = openai_complete_if_cache(model="gpt-4o", prompt=prompt) file_path = f"../datasets/questions/{cls}_questions.txt" with open(file_path, "w") as file: file.write(result) - print(f"{cls}_questions written to {file_path}") \ No newline at end of file + print(f"{cls}_questions written to {file_path}") diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py index a79ebd17a..a56190fc6 100644 --- a/reproduce/Step_3.py +++ b/reproduce/Step_3.py @@ -4,16 +4,18 @@ from lightrag import LightRAG, QueryParam from tqdm import tqdm + def extract_queries(file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = f.read() - - data = data.replace('**', '') - queries = re.findall(r'- Question \d+: (.+)', data) + data = data.replace("**", "") + + queries = re.findall(r"- Question \d+: (.+)", data) return queries + async def process_query(query_text, rag_instance, query_param): try: result, context = await rag_instance.aquery(query_text, param=query_param) @@ -21,6 +23,7 @@ async def process_query(query_text, rag_instance, query_param): except Exception as e: return None, {"query": query_text, "error": str(e)} + def always_get_an_event_loop() -> asyncio.AbstractEventLoop: try: loop = asyncio.get_event_loop() @@ -29,15 +32,22 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: asyncio.set_event_loop(loop) return loop -def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file, error_file): + +def run_queries_and_save_to_json( + queries, rag_instance, query_param, output_file, error_file +): loop = always_get_an_event_loop() - with open(output_file, 'a', encoding='utf-8') as result_file, open(error_file, 'a', encoding='utf-8') as err_file: + with open(output_file, "a", encoding="utf-8") as result_file, open( + error_file, "a", encoding="utf-8" + ) as err_file: result_file.write("[\n") first_entry = True for query_text in tqdm(queries, desc="Processing queries", unit="query"): - result, error = loop.run_until_complete(process_query(query_text, rag_instance, query_param)) + result, error = loop.run_until_complete( + process_query(query_text, rag_instance, query_param) + ) if result: if not first_entry: @@ -50,6 +60,7 @@ def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file result_file.write("\n]") + if __name__ == "__main__": cls = "agriculture" mode = "hybrid" @@ -59,4 +70,6 @@ def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file query_param = QueryParam(mode=mode) queries = extract_queries(f"../datasets/questions/{cls}_questions.txt") - run_queries_and_save_to_json(queries, rag, query_param, f"{cls}_result.json", f"{cls}_errors.json") + run_queries_and_save_to_json( + queries, rag, query_param, f"{cls}_result.json", f"{cls}_errors.json" + ) diff --git a/reproduce/Step_3_openai_compatible.py b/reproduce/Step_3_openai_compatible.py index 7b3079a9b..2be5ea5cb 100644 --- a/reproduce/Step_3_openai_compatible.py +++ b/reproduce/Step_3_openai_compatible.py @@ -8,6 +8,7 @@ from lightrag.utils import EmbeddingFunc import numpy as np + ## For Upstage API # please check if embedding_dim=4096 in lightrag.py and llm.py in lightrag direcotry async def llm_model_func( @@ -20,28 +21,33 @@ async def llm_model_func( history_messages=history_messages, api_key=os.getenv("UPSTAGE_API_KEY"), base_url="https://api.upstage.ai/v1/solar", - **kwargs + **kwargs, ) + async def embedding_func(texts: list[str]) -> np.ndarray: return await 
openai_embedding( texts, model="solar-embedding-1-large-query", api_key=os.getenv("UPSTAGE_API_KEY"), - base_url="https://api.upstage.ai/v1/solar" + base_url="https://api.upstage.ai/v1/solar", ) + + ## /For Upstage API + def extract_queries(file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = f.read() - - data = data.replace('**', '') - queries = re.findall(r'- Question \d+: (.+)', data) + data = data.replace("**", "") + + queries = re.findall(r"- Question \d+: (.+)", data) return queries + async def process_query(query_text, rag_instance, query_param): try: result, context = await rag_instance.aquery(query_text, param=query_param) @@ -49,6 +55,7 @@ async def process_query(query_text, rag_instance, query_param): except Exception as e: return None, {"query": query_text, "error": str(e)} + def always_get_an_event_loop() -> asyncio.AbstractEventLoop: try: loop = asyncio.get_event_loop() @@ -57,15 +64,22 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: asyncio.set_event_loop(loop) return loop -def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file, error_file): + +def run_queries_and_save_to_json( + queries, rag_instance, query_param, output_file, error_file +): loop = always_get_an_event_loop() - with open(output_file, 'a', encoding='utf-8') as result_file, open(error_file, 'a', encoding='utf-8') as err_file: + with open(output_file, "a", encoding="utf-8") as result_file, open( + error_file, "a", encoding="utf-8" + ) as err_file: result_file.write("[\n") first_entry = True for query_text in tqdm(queries, desc="Processing queries", unit="query"): - result, error = loop.run_until_complete(process_query(query_text, rag_instance, query_param)) + result, error = loop.run_until_complete( + process_query(query_text, rag_instance, query_param) + ) if result: if not first_entry: @@ -78,22 +92,24 @@ def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file result_file.write("\n]") + if __name__ == "__main__": cls = "mix" mode = "hybrid" WORKING_DIR = f"../{cls}" rag = LightRAG(working_dir=WORKING_DIR) - rag = LightRAG(working_dir=WORKING_DIR, - llm_model_func=llm_model_func, - embedding_func=EmbeddingFunc( - embedding_dim=4096, - max_token_size=8192, - func=embedding_func - ) - ) + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), + ) query_param = QueryParam(mode=mode) - base_dir='../datasets/questions' + base_dir = "../datasets/questions" queries = extract_queries(f"{base_dir}/{cls}_questions.txt") - run_queries_and_save_to_json(queries, rag, query_param, f"{base_dir}/result.json", f"{base_dir}/errors.json") + run_queries_and_save_to_json( + queries, rag, query_param, f"{base_dir}/result.json", f"{base_dir}/errors.json" + ) diff --git a/requirements.txt b/requirements.txt index a1054692a..d5479dab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ +accelerate aioboto3 -openai -tiktoken -networkx graspologic -nano-vectordb hnswlib -xxhash +nano-vectordb +networkx +ollama +openai tenacity -transformers +tiktoken torch -ollama -accelerate \ No newline at end of file +transformers +xxhash From 4945027dc025c73763ecc271017152273a81d86d Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sat, 19 Oct 2024 21:35:50 +0800 Subject: [PATCH 58/67] Update README.md --- README.md | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 
insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a3e5c1b4d..e2f7e81a7 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode=
Using Open AI-like APIs -LightRAG also supports Open AI-like chat/embeddings APIs: +* LightRAG also supports Open AI-like chat/embeddings APIs: ```python async def llm_model_func( prompt, system_prompt=None, history_messages=[], **kwargs @@ -130,7 +130,7 @@ rag = LightRAG(
Using Hugging Face Models -If you want to use Hugging Face models, you only need to set LightRAG as follows: +* If you want to use Hugging Face models, you only need to set LightRAG as follows: ```python from lightrag.llm import hf_model_complete, hf_embedding from transformers import AutoModel, AutoTokenizer @@ -156,7 +156,8 @@ rag = LightRAG(
Using Ollama Models -If you want to use Ollama models, you only need to set LightRAG as follows: + +* If you want to use Ollama models, you only need to set LightRAG as follows: ```python from lightrag.llm import ollama_model_complete, ollama_embedding @@ -177,6 +178,29 @@ rag = LightRAG( ), ) ``` + +* Increasing the `num_ctx` parameter: + +1. Pull the model: +```python +ollama pull qwen2 +``` + +2. Display the model file: +```python +ollama show --modelfile qwen2 > Modelfile +``` + +3. Edit the Modelfile by adding the following line: +```python +PARAMETER num_ctx 32768 +``` + +4. Create the modified model: +```python +ollama create -f Modelfile qwen2m +``` +
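Once the larger-context model has been created, it can be used in the Ollama setup above simply by changing the model name. A minimal sketch, assuming the modified model was created as `qwen2m` in step 4 and that `nomic-embed-text` is still used for embeddings:

```python
from lightrag import LightRAG
from lightrag.llm import ollama_model_complete, ollama_embedding
from lightrag.utils import EmbeddingFunc

# Same Ollama setup as above, but pointing at the modified model
# (the name "qwen2m" comes from the `ollama create` step above).
rag = LightRAG(
    working_dir="./dickens",
    llm_model_func=ollama_model_complete,
    llm_model_name="qwen2m",
    embedding_func=EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"),
    ),
)
```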
### Batch Insert @@ -441,6 +465,8 @@ def extract_queries(file_path): ├── examples │ ├── batch_eval.py │ ├── generate_query.py +│ ├── lightrag_azure_openai_demo.py +│ ├── lightrag_bedrock_demo.py │ ├── lightrag_hf_demo.py │ ├── lightrag_ollama_demo.py │ ├── lightrag_openai_compatible_demo.py @@ -459,6 +485,8 @@ def extract_queries(file_path): │ ├── Step_1.py │ ├── Step_2.py │ └── Step_3.py +├── .gitignore +├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── requirements.txt From 263cde887156fa2d6108fa8463fdfd16b4b52fb1 Mon Sep 17 00:00:00 2001 From: nongbin Date: Sun, 20 Oct 2024 09:55:52 +0800 Subject: [PATCH 59/67] add visualizing graph --- .gitignore | 1 + .idea/.gitignore | 8 ++++ .idea/LightRAG.iml | 12 ++++++ .idea/inspectionProfiles/Project_Default.xml | 38 +++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 +++ .idea/misc.xml | 7 ++++ .idea/modules.xml | 8 ++++ .idea/vcs.xml | 6 +++ examples/graph_visual.py | 14 +++++++ requirements.txt | 1 + 10 files changed, 101 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/LightRAG.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 examples/graph_visual.py diff --git a/.gitignore b/.gitignore index 50f384ec3..208668c5b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__ dickens/ book.txt lightrag-dev/ +*.idea \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 000000000..13566b81b --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/LightRAG.iml b/.idea/LightRAG.iml new file mode 100644 index 000000000..8b8c39547 --- /dev/null +++ b/.idea/LightRAG.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 000000000..c41eaf208 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,38 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 000000000..105ce2da2 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 000000000..676ac0f0f --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 000000000..145d7086c --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..35eb1ddfb --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/examples/graph_visual.py b/examples/graph_visual.py new file mode 100644 index 000000000..72c72bad0 --- /dev/null +++ b/examples/graph_visual.py @@ -0,0 +1,14 @@ +import networkx as nx +from pyvis.network import Network + +# Load the GraphML file +G = nx.read_graphml('./dickens/graph_chunk_entity_relation.graphml') 
+ +# Create a Pyvis network +net = Network(notebook=True) + +# Convert NetworkX graph to Pyvis network +net.from_nx(G) + +# Save and display the network +net.show('knowledge_graph.html') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d5479dab1..9cc5b7e95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ tiktoken torch transformers xxhash +pyvis \ No newline at end of file From a7e43406a5d6113a5a0483b187652c74868a21b2 Mon Sep 17 00:00:00 2001 From: nongbin Date: Sun, 20 Oct 2024 09:57:14 +0800 Subject: [PATCH 60/67] delete not used files --- .idea/.gitignore | 8 ---- .idea/LightRAG.iml | 12 ------ .idea/inspectionProfiles/Project_Default.xml | 38 ------------------- .../inspectionProfiles/profiles_settings.xml | 6 --- .idea/misc.xml | 7 ---- .idea/modules.xml | 8 ---- .idea/vcs.xml | 6 --- 7 files changed, 85 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/LightRAG.iml delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b81b..000000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/LightRAG.iml b/.idea/LightRAG.iml deleted file mode 100644 index 8b8c39547..000000000 --- a/.idea/LightRAG.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index c41eaf208..000000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2..000000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 676ac0f0f..000000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 145d7086c..000000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1ddfb..000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From c6585ff89f858b8d39de3eb5d4b71d59a0771a47 Mon Sep 17 00:00:00 2001 From: nongbin Date: Sun, 20 Oct 2024 10:04:34 +0800 Subject: [PATCH 61/67] ignore idea files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 208668c5b..edfbfbfcc 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ __pycache__ dickens/ book.txt lightrag-dev/ -*.idea \ No newline at end of file +.idea/ \ No newline at end of file From 347e8a97be3ee4e1b87ad0b16f7060e4643132a6 Mon Sep 17 00:00:00 2001 From: hanbin49 <554066527@qq.com> Date: Sun, 20 Oct 2024 11:27:47 +0800 Subject: [PATCH 62/67] 'update' --- examples/vram_management_demo.py 
| 82 ++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 examples/vram_management_demo.py diff --git a/examples/vram_management_demo.py b/examples/vram_management_demo.py new file mode 100644 index 000000000..505e47617 --- /dev/null +++ b/examples/vram_management_demo.py @@ -0,0 +1,82 @@ +import os +import time +from lightrag import LightRAG, QueryParam +from lightrag.llm import ollama_model_complete, ollama_embedding +from lightrag.utils import EmbeddingFunc + +# 工作目录和文本文件目录路径 +WORKING_DIR = "./dickens" +TEXT_FILES_DIR = "/llm/mt" + +# 如果工作目录不存在,则创建该目录 +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +# 初始化 LightRAG +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=ollama_model_complete, + llm_model_name="qwen2.5:3b-instruct-max-context", + embedding_func=EmbeddingFunc( + embedding_dim=768, + max_token_size=8192, + func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"), + ), +) + +# 读取 TEXT_FILES_DIR 目录下所有的 .txt 文件 +texts = [] +for filename in os.listdir(TEXT_FILES_DIR): + if filename.endswith('.txt'): + file_path = os.path.join(TEXT_FILES_DIR, filename) + with open(file_path, 'r', encoding='utf-8') as file: + texts.append(file.read()) + +# 批量插入文本到 LightRAG,带有重试机制 +def insert_texts_with_retry(rag, texts, retries=3, delay=5): + for _ in range(retries): + try: + rag.insert(texts) + return + except Exception as e: + print(f"Error occurred during insertion: {e}. Retrying in {delay} seconds...") + time.sleep(delay) + raise RuntimeError("Failed to insert texts after multiple retries.") + +insert_texts_with_retry(rag, texts) + +# 执行不同类型的查询,并处理潜在的错误 +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +except Exception as e: + print(f"Error performing naive search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +except Exception as e: + print(f"Error performing local search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +except Exception as e: + print(f"Error performing global search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +except Exception as e: + print(f"Error performing hybrid search: {e}") + +# 清理 VRAM 资源的函数 +def clear_vram(): + os.system("sudo nvidia-smi --gpu-reset") + +# 定期清理 VRAM 以防止溢出 +clear_vram_interval = 3600 # 每小时清理一次 +start_time = time.time() + +while True: + current_time = time.time() + if current_time - start_time > clear_vram_interval: + clear_vram() + start_time = current_time + time.sleep(60) # 每分钟检查一次时间 \ No newline at end of file From a716e628e370719e0fdcb847e4cd9b4212cc72eb Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:08:49 +0800 Subject: [PATCH 63/67] Add vram_management_demo.py --- examples/vram_management_demo.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/vram_management_demo.py b/examples/vram_management_demo.py index 505e47617..ec7502549 100644 --- a/examples/vram_management_demo.py +++ b/examples/vram_management_demo.py @@ -4,15 +4,15 @@ from lightrag.llm import ollama_model_complete, ollama_embedding from lightrag.utils import EmbeddingFunc -# 工作目录和文本文件目录路径 +# Working directory and the directory path for text files WORKING_DIR = "./dickens" TEXT_FILES_DIR = "/llm/mt" -# 如果工作目录不存在,则创建该目录 +# Create the working directory if it doesn't exist if not 
os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) -# 初始化 LightRAG +# Initialize LightRAG rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=ollama_model_complete, @@ -24,7 +24,7 @@ ), ) -# 读取 TEXT_FILES_DIR 目录下所有的 .txt 文件 +# Read all .txt files from the TEXT_FILES_DIR directory texts = [] for filename in os.listdir(TEXT_FILES_DIR): if filename.endswith('.txt'): @@ -32,7 +32,7 @@ with open(file_path, 'r', encoding='utf-8') as file: texts.append(file.read()) -# 批量插入文本到 LightRAG,带有重试机制 +# Batch insert texts into LightRAG with a retry mechanism def insert_texts_with_retry(rag, texts, retries=3, delay=5): for _ in range(retries): try: @@ -45,7 +45,7 @@ def insert_texts_with_retry(rag, texts, retries=3, delay=5): insert_texts_with_retry(rag, texts) -# 执行不同类型的查询,并处理潜在的错误 +# Perform different types of queries and handle potential errors try: print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) except Exception as e: @@ -66,12 +66,12 @@ def insert_texts_with_retry(rag, texts, retries=3, delay=5): except Exception as e: print(f"Error performing hybrid search: {e}") -# 清理 VRAM 资源的函数 +# Function to clear VRAM resources def clear_vram(): os.system("sudo nvidia-smi --gpu-reset") -# 定期清理 VRAM 以防止溢出 -clear_vram_interval = 3600 # 每小时清理一次 +# Regularly clear VRAM to prevent overflow +clear_vram_interval = 3600 # Clear once every hour start_time = time.time() while True: @@ -79,4 +79,4 @@ def clear_vram(): if current_time - start_time > clear_vram_interval: clear_vram() start_time = current_time - time.sleep(60) # 每分钟检查一次时间 \ No newline at end of file + time.sleep(60) # Check the time every minute From ae4aafb525b2366499b1d9cf5dd2e92731464569 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:10:00 +0800 Subject: [PATCH 64/67] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e2f7e81a7..bf996f82a 100644 --- a/README.md +++ b/README.md @@ -470,7 +470,8 @@ def extract_queries(file_path): │ ├── lightrag_hf_demo.py │ ├── lightrag_ollama_demo.py │ ├── lightrag_openai_compatible_demo.py -│ └── lightrag_openai_demo.py +│ ├── lightrag_openai_demo.py +│ └── vram_management_demo.py ├── lightrag │ ├── __init__.py │ ├── base.py From c800fa48435fab8d2aca945e68d5f9f52c988f9e Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:22:43 +0800 Subject: [PATCH 65/67] Update README.md --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index bf996f82a..c8d6e312d 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,26 @@ rag = LightRAG(working_dir="./dickens") with open("./newText.txt") as f: rag.insert(f.read()) ``` + +### Graph Visualization + +* Generate html file +```python +import networkx as nx +from pyvis.network import Network + +# Load the GraphML file +G = nx.read_graphml('./dickens/graph_chunk_entity_relation.graphml') + +# Create a Pyvis network +net = Network(notebook=True) + +# Convert NetworkX graph to Pyvis network +net.from_nx(G) + +# Save and display the network +net.show('knowledge_graph.html') +``` ## Evaluation ### Dataset The dataset used in LightRAG can be downloaded from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). 
@@ -465,6 +485,7 @@ def extract_queries(file_path): ├── examples │ ├── batch_eval.py │ ├── generate_query.py +│ ├── graph_visual.py │ ├── lightrag_azure_openai_demo.py │ ├── lightrag_bedrock_demo.py │ ├── lightrag_hf_demo.py From f400b02b0f23401907a1aab004ab7bbc39615364 Mon Sep 17 00:00:00 2001 From: nongbin Date: Sun, 20 Oct 2024 21:17:09 +0800 Subject: [PATCH 66/67] make graph visualization become colorful --- examples/graph_visual.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/graph_visual.py b/examples/graph_visual.py index 72c72bad0..b455e6de9 100644 --- a/examples/graph_visual.py +++ b/examples/graph_visual.py @@ -1,5 +1,6 @@ import networkx as nx from pyvis.network import Network +import random # Load the GraphML file G = nx.read_graphml('./dickens/graph_chunk_entity_relation.graphml') @@ -10,5 +11,9 @@ # Convert NetworkX graph to Pyvis network net.from_nx(G) +# Add colors to nodes +for node in net.nodes: + node['color'] = "#{:06x}".format(random.randint(0, 0xFFFFFF)) + # Save and display the network net.show('knowledge_graph.html') \ No newline at end of file From 2ddec8371fe87d4c971c326c691393544b1481ff Mon Sep 17 00:00:00 2001 From: ivs Date: Mon, 21 Oct 2024 14:59:51 +0530 Subject: [PATCH 67/67] file processing, fileinfo in chunk meta, prompts config --- lightrag/base.py | 6 + lightrag/lightrag.py | 104 ++++++++++++- lightrag/operate.py | 82 ++++++++-- lightrag/prompt.py | 237 ++--------------------------- lightrag/prompts/code.toml | 272 ++++++++++++++++++++++++++++++++++ lightrag/prompts/default.toml | 228 ++++++++++++++++++++++++++++ lightrag/storage.py | 6 +- lightrag/utils.py | 10 ++ 8 files changed, 706 insertions(+), 239 deletions(-) create mode 100644 lightrag/prompts/code.toml create mode 100644 lightrag/prompts/default.toml diff --git a/lightrag/base.py b/lightrag/base.py index 50be4f621..f8bba04d5 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -52,7 +52,13 @@ async def upsert(self, data: dict[str, dict]): """ raise NotImplementedError + def dump(self): + """For debug purposes + Raises: + NotImplementedError: implement in subclass + """ + raise NotImplementedError @dataclass class BaseKVStorage(Generic[T], StorageNameSpace): async def all_keys(self) -> list[str]: diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5137af427..d2d1e01e8 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1,9 +1,10 @@ import asyncio import os +import textwrap from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast +from typing import Type, List, Union, cast from .llm import ( gpt_4o_mini_complete, @@ -157,11 +158,108 @@ def __post_init__(self): partial(self.llm_model_func, hashing_kv=self.llm_response_cache) ) - def insert(self, string_or_strings): + def insert(self, input_data: Union[str, os.PathLike, List[Union[str, os.PathLike]]]): + """Insert content from a string, file path, or a list of them.""" + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert(input_data)) + + async def ainsert(self, input_data: Union[str, os.PathLike, List[Union[str, os.PathLike]]]): + """Asynchronously handle inserting content from strings or file paths.""" + try: + # Ensure input is treated as a list + if isinstance(input_data, (str, os.PathLike)): + input_data = [input_data] + + contents = [] + # Process each item: read from file or use the string directly + for item in input_data: + if isinstance(item, os.PathLike) and os.path.isfile(item): + 
with open(item, 'r') as f: + content = f.read().strip() + contents.append((content, os.path.basename(item), os.path.abspath(item))) + else: + contents.append((item.strip(), "!none", "!none")) + + # Create documents with hashed keys + new_docs = { + compute_mdhash_id(content.strip(), prefix="doc-"): { + "content": content, + "filename": filename, + "filepath": filepath + } + for content, filename, filepath in contents + } + + # Filter out already stored documents + _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not new_docs: + logger.warning("All docs are already in the storage") + return + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + # Chunk documents with metadata prefixes for each chunk + inserting_chunks = {} + for doc_key, doc in new_docs.items(): + content = doc["content"] + filename = doc["filename"] + filepath = doc["filepath"] + + # Generate chunks with metadata directly + chunks = { + compute_mdhash_id(f"{doc_key}-{i}", prefix="chunk-"): { + **dp, + "content": dp["content"], + "full_doc_id": doc_key, + } + for i, dp in enumerate(chunking_by_token_size( + content, + overlap_token_size=self.chunk_overlap_token_size, + max_token_size=self.chunk_token_size, + tiktoken_model=self.tiktoken_model_name, + filename=filename, + filepath=filepath + )) + } + inserting_chunks.update(chunks) + + # Filter out already stored chunks + _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} + if not inserting_chunks: + logger.warning("All chunks are already in the storage") + return + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + # Insert chunks into vector database + await self.chunks_vdb.upsert(inserting_chunks) + + # Extract entities and relationships + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knwoledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + self.chunk_entity_relation_graph = maybe_new_kg + + # Upsert documents and chunks into their storages + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + + finally: + await self._insert_done() + + def _insert(self, string_or_strings): loop = always_get_an_event_loop() return loop.run_until_complete(self.ainsert(string_or_strings)) - async def ainsert(self, string_or_strings): + async def _ainsert(self, string_or_strings): try: if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] diff --git a/lightrag/operate.py b/lightrag/operate.py index a0729cd81..106c518c8 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1,6 +1,7 @@ import asyncio import json import re +import textwrap from typing import Union from collections import Counter, defaultdict import warnings @@ -25,8 +26,46 @@ ) from .prompt import GRAPH_FIELD_SEP, PROMPTS - def chunking_by_token_size( + content: str, + overlap_token_size=128, + max_token_size=1024, + tiktoken_model="gpt-4o", + filename="!none", + filepath="!none" +): + """Chunk content by token size with metadata prefixed to each chunk.""" + tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) + results = [] + + for 
index, start in enumerate(
+        range(0, len(tokens), max_token_size - overlap_token_size)
+    ):
+        end = min(start + max_token_size, len(tokens))
+
+        # Decode the token slice back into text so chunk size and overlap follow token counts
+        chunk_content = decode_tokens_by_tiktoken(tokens[start:end], model_name=tiktoken_model).strip()
+
+        # Prefix metadata to each chunk
+        chunk_with_metadata = textwrap.dedent(f"""
+        ####
+        ## FILENAME: {filename}
+        ## FILEPATH: {filepath}
+        ## CHUNK_NUM: {index}
+        ####
+        """).strip() + "\n" + chunk_content.strip()
+
+
+        # Store the chunk with its metadata
+        results.append({
+            "content": chunk_with_metadata,
+            "chunk_order_index": index,
+            "tokens": end - start
+        })
+
+    return results
+
+def _chunking_by_token_size(
     content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
 ):
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
@@ -78,20 +117,29 @@ async def _handle_single_entity_extraction(
     record_attributes: list[str],
     chunk_key: str,
 ):
-    if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
+    if len(record_attributes) < 7 or record_attributes[0] != '"entity"':
         return None
+
     # add this record as a node in the G
-    entity_name = clean_str(record_attributes[1].upper())
+    entity_name = clean_str(record_attributes[1]) # removed upper
     if not entity_name.strip():
         return None
+
+    entity_type = clean_str(record_attributes[2])
     entity_description = clean_str(record_attributes[3])
     entity_source_id = chunk_key
+    entity_file = record_attributes[4]
+    entity_path = record_attributes[5]
+    entity_chunk = record_attributes[6]
+
     return dict(
         entity_name=entity_name,
         entity_type=entity_type,
         description=entity_description,
         source_id=entity_source_id,
+        file=entity_file,
+        path=entity_path,
+        chunk=entity_chunk,
     )
@@ -99,18 +147,23 @@ async def _handle_single_relationship_extraction(
     record_attributes: list[str],
     chunk_key: str,
 ):
-    if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
-        return None
+    if len(record_attributes) < 9 or record_attributes[0] != '"relationship"':
+        return None # "content_keywords" should be processed somewhere
+
     # add this record as edge
-    source = clean_str(record_attributes[1].upper())
-    target = clean_str(record_attributes[2].upper())
+    source = clean_str(record_attributes[1]) # capitalization disabled
+    target = clean_str(record_attributes[2])
     edge_description = clean_str(record_attributes[3])
     edge_keywords = clean_str(record_attributes[4])
     edge_source_id = chunk_key
     weight = (
-        float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0
+        float(record_attributes[5]) if is_float_regex(record_attributes[5]) else 1.0
     )
+    edge_file = record_attributes[6]
+    edge_path = record_attributes[7]
+    edge_chunk = record_attributes[8]
+
     return dict(
         src_id=source,
         tgt_id=target,
@@ -118,6 +171,9 @@
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        file=edge_file,
+        path=edge_path,
+        chunk=edge_chunk,
     )
@@ -266,7 +322,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
     chunk_key = chunk_key_dp[0]
     chunk_dp = chunk_key_dp[1]
     content = chunk_dp["content"]
-    hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
+    hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)#.replace('{{', '{').replace('}}', '}')
 
     final_result = await use_llm_func(hint_prompt)
     history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
@@ -592,6 +648,9 @@ async def 
_find_most_related_text_unit_from_entities(
     all_text_units = sorted(
         all_text_units, key=lambda x: (x["order"], -x["relation_counts"])
     )
+    # somehow empty chunks can be here
+    all_text_units = [t for t in all_text_units if t.get("data") and t["data"].get("content")]
+
     all_text_units = truncate_list_by_token_size(
         all_text_units,
         key=lambda x: x["data"]["content"],
@@ -966,7 +1025,7 @@ async def hybrid_query(
 
 def combine_contexts(high_level_context, low_level_context):
     # Function to extract entities, relationships, and sources from context strings
-
     def extract_sections(context):
         entities_match = re.search(
             r"-----Entities-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL
@@ -1002,7 +1060,7 @@ def extract_sections(context):
     else:
         ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context)
 
-    # Combine and deduplicate the entities
+    # Combine and deduplicate the entities
     combined_entities_set = set(
         filter(None, hl_entities.strip().split("\n") + ll_entities.strip().split("\n"))
     )
diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index ba2516d89..e704244c1 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -1,3 +1,6 @@
+import os
+import toml
+
 GRAPH_FIELD_SEP = "<SEP>"
 
 PROMPTS = {}
@@ -7,229 +10,17 @@
 PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
 
-PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
-
-PROMPTS["entity_extraction"] = """-Goal-
-Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
-
--Steps-
-1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
-- entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
-Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
-
-2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
-For each pair of related entities, extract the following information:
-- source_entity: name of the source entity, as identified in step 1
-- target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
-- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
-- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
-Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
-
-3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
-Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
-
-4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
-
-5. 
When finished, output {completion_delimiter} - -###################### --Examples- -###################### -Example 1: - -Entity_types: [person, technology, mission, organization, location] -Text: -while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. - -Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” - -The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. - -It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths -################ -Output: -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} -("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} -("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential 
impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} -("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} -############################# -Example 2: - -Entity_types: [person, technology, mission, organization, location] -Text: -They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. - -Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. - -Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly -############# -Output: -("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} -("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} -("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} -("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} -############################# -Example 3: - -Entity_types: [person, role, technology, organization, event, location, concept] -Text: -their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. - -"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." - -Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. 
"This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." - -Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. - -The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation -############# -Output: -("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} -("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} -("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} -("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} -("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} -("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} -("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} -############################# --Real Data- -###################### -Entity_types: {entity_types} -Text: {input_text} -###################### -Output: -""" - -PROMPTS[ - "summarize_entity_descriptions" -] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. -Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. -Please concatenate all of these into a single, comprehensive description. 
Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. -Make sure it is written in third person, and include the entity names so we the have full context. - -####### --Data- -Entities: {entity_name} -Description List: {description_list} -####### -Output: -""" - -PROMPTS[ - "entiti_continue_extraction" -] = """MANY entities were missed in the last extraction. Add them below using the same format: -""" - -PROMPTS[ - "entiti_if_loop_extraction" -] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. -""" - -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." - -PROMPTS["rag_response"] = """---Role--- - -You are a helpful assistant responding to questions about data in the tables provided. - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. - ----Target response length and format--- - -{response_type} - ----Data tables--- - -{context_data} - -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. -""" - -PROMPTS["keywords_extraction"] = """---Role--- - -You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. - ----Goal--- - -Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. - ----Instructions--- - -- Output the keywords in JSON format. -- The JSON should have two keys: - - "high_level_keywords" for overarching concepts or themes. - - "low_level_keywords" for specific entities or details. - -###################### --Examples- -###################### -Example 1: - -Query: "How does international trade influence global economic stability?" -################ -Output: -{{ - "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], - "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] -}} -############################# -Example 2: - -Query: "What are the environmental consequences of deforestation on biodiversity?" -################ -Output: -{{ - "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], - "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] -}} -############################# -Example 3: +def load_prompts(toml_file=os.path.join(os.path.dirname(__file__), "prompts", "code.toml")): + """Load prompts from a TOML file and merge them into the existing PROMPTS dictionary.""" + try: + # Load prompts from the TOML file. + toml_data = toml.load(toml_file) -Query: "What is the role of education in reducing poverty?" 
-################ -Output: -{{ - "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], - "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] -}} -############################# --Real Data- -###################### -Query: {query} -###################### -Output: + # Merge TOML prompts into the existing PROMPTS dictionary. + PROMPTS.update({k: v for k, v in toml_data.items() if v}) -""" + except Exception as e: + print(f"Error loading and merging prompts: {e}") -PROMPTS["naive_rag_response"] = """You're a helpful assistant -Below are the knowledge you know: -{content_data} ---- -If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. ----Target response length and format--- -{response_type} -""" +# Example usage: Load TOML prompts and merge with existing PROMPTS. +load_prompts() \ No newline at end of file diff --git a/lightrag/prompts/code.toml b/lightrag/prompts/code.toml new file mode 100644 index 000000000..46cb0c295 --- /dev/null +++ b/lightrag/prompts/code.toml @@ -0,0 +1,272 @@ +DEFAULT_ENTITY_TYPES = ["function", "class", "module", "file", "variable", "comment", "readme", "test_case", "dependency", "call"] + +entity_extraction = """ +-Goal- +Given a source code document chunk (prefixed with metadata) and a list of entity types, extract all relevant entities and relationships within the chunk. Pay special attention to the code structure (functions, classes, modules) and companion text (comments, README content etc). + +Metadata prefixes are provided in each chunk as follows: +#### +## FILENAME: +## FILEPATH: +## CHUNK_NUM: +#### + +- Use this metadata to link extracted entities to their source location and maintain context across chunks. + +-Steps- +1. Identify All Entities: + For each identified entity, extract the following: + - entity_name: The name of the entity e.g., function name, class name, variable name, etc. + - entity_type: One of the following types: [{entity_types}] + - entity_description: A detailed description of the entity's role, behavior, and attributes (e.g., what a function does, what a class represents). + - file_name: FILENAME field from metadata + - file_path: FILE_PATH field from metadata + - chunk_num: CHUNK_NUM field from metadata + + Format each entity as: + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. Identify Relationships: + Identify all related pairs of entities. Focus on relationships such as function calls, inheritance, dependencies, and code references. + + For each relationship, extract: + - source_entity: Name of the source entity (e.g., function or class making a call). + - target_entity: Name of the target entity (e.g., function being called or class being inherited). + - relationship_description: Explanation of how the two entities are related. + - relationship_strength: A numeric score (1-10) indicating the relationship's strength (e.g., how central the dependency is). 
+ - relationship_keywords: One or more keywords summarizing the relationship (e.g., "function call", "inheritance", "dependency"). + - file_name: FILENAME field from metadata + - file_path: FILE_PATH field from metadata + - chunk_num: CHUNK_NUM field from metadata + + Format each relationship as: + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify Companion Documentation Entities: + Identify documentation entities (e.g., README sections, inline comments) that add context to the code. Extract: + - entity_name: Name of the documentation section or comment (e.g., README, comment block). + - entity_type: Either "readme" or "comment". + - entity_description: A summary of the information provided by the documentation or comment. + If there're some code blocks - process them as a code + +4. Summarize High-Level Keywords: + Extract high-level keywords that summarize the key concepts, themes, or operations in the code chunk. Focus on terms such as "data processing", "unit test", or "dependency injection". + + Format the keywords as: + ("content_keywords"{tuple_delimiter}) + +5. Output Format: + Return all entities and relationships as a single list, using {record_delimiter} as the delimiter between records. When done, use {completion_delimiter} to indicate the end of the output. + Output should be precisely structured, because it will be used for automation. + +###################### +-Examples with metadata- +###################### + +*Example 1 (TypeScript Code with Imports): +#### +## FILENAME: example.ts +## FILEPATH: ./src/example.ts +## CHUNK_NUM: 0 +#### +import {{ Component }} from 'react'; +import {{ useState, useEffect }} from 'react'; +import {{ someUtility }} from './utils'; + +function App() {{ + const [state, setState] = useState(0); + useEffect(() => {{ + console.log("Component mounted"); + }}, []); + return ; +}} + +Output: +("entity"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"filepath"{tuple_delimiter}"A TypeScript source file containing a React component."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"App"{tuple_delimiter}"function"{tuple_delimiter}"The main application component function."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"state"{tuple_delimiter}"variable"{tuple_delimiter}"State variable to manage the component state."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"setState"{tuple_delimiter}"function"{tuple_delimiter}"Function to update the state."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"react"{tuple_delimiter}"dependency"{tuple_delimiter}"A library providing Component, useState, and useEffect."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"./utils"{tuple_delimiter}"dependency"{tuple_delimiter}"Utility module imported from './utils'."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"App"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"The 'App' function is defined in this 
file."{tuple_delimiter}"definition"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"state"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"The state variable is declared in this file."{tuple_delimiter}"declaration"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"react"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"React is imported in this file."{tuple_delimiter}"import"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("content_keywords"{tuple_delimiter}"react, state management, rendering, hooks, imports"){completion_delimiter} + +*Example 2 (Poorly Chunked Rust Code): +#### +## FILENAME: main.rs +## FILEPATH: ./src/main.rs +## CHUNK_NUM: 3 +#### += 2; +// printing n +println!("{{}}", n); +exi + +Output: +("entity"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"filepath"{tuple_delimiter}"A Rust source file with a poorly chunked snippet."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("entity"{tuple_delimiter}"n"{tuple_delimiter}"variable"{tuple_delimiter}"A variable being printed."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("entity"{tuple_delimiter}"println!()"{tuple_delimiter}"call"{tuple_delimiter}"A macro to print a variable."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("relationship"{tuple_delimiter}"n"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"The 'n' variable is defined in this file."{tuple_delimiter}"declaration"{tuple_delimiter}7{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("relationship"{tuple_delimiter}"println!()"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"The 'println!()' macro is used in this file."{tuple_delimiter}"usage"{tuple_delimiter}7{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("content_keywords"{tuple_delimiter}"variable, printing, macro, call"){completion_delimiter} + +*Example 3 (C++ Code Example): +#### +## FILENAME: test.cc +## FILEPATH: ./src/tests/test.cc +## CHUNK_NUM: 1 +#### +int add(int a, int b) {{ + return a + b; +}} + +Output: +("entity"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}"filepath"{tuple_delimiter}"A C++ source file defining the 'add' function."{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("entity"{tuple_delimiter}"add"{tuple_delimiter}"function"{tuple_delimiter}"A function adding two integers."{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("relationship"{tuple_delimiter}"add"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}"The 'add' function is defined in this file."{tuple_delimiter}"definition"{tuple_delimiter}7{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("content_keywords"{tuple_delimiter}"function, parameters, addition"){completion_delimiter} + +*Example 4 (README with Embedded Code): +#### +## FILENAME: README.md +## FILEPATH: ./README.md +## CHUNK_NUM: 0 +#### +# Project Overview +This project implements arithmetic functions, including addition. 
+ +## Usage: +```c +add(1, 2); +``` + +Output: +("entity"{tuple_delimiter}"./README.md"{tuple_delimiter}"filepath"{tuple_delimiter}"A README file describing the project overview and usage."{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"add"{tuple_delimiter}"function"{tuple_delimiter}"An arithmetic function to add two numbers."{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"add"{tuple_delimiter}"./README.md"{tuple_delimiter}"The 'add' function is referenced in this file."{tuple_delimiter}"reference"{tuple_delimiter}6{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("content_keywords"{tuple_delimiter}"arithmetic, usage, embedded code"){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +summarize_entity_descriptions = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or more entities and their relationships, create a single coherent description that captures the role, interactions, and significance of these entities. + +Make sure to: +1. Include information about all entities and their relationships. +2. Resolve any contradictions between the descriptions. +3. Write in the third person, mentioning each entity by name to maintain full context. +4. Highlight any function calls or dependencies that are important to the relationships. +5. Data will be used for automation, so output raw JSON text without code blocks or formatting. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +entiti_continue_extraction = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +entiti_if_loop_extraction = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +fail_response = "Sorry, I'm not able to provide an answer to that question" + +rag_response = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + +---Data tables--- + +{context_data} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +keywords_extraction = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the provided code-related query or content. + +---Goal--- + +Given the input, list both high-level and low-level keywords. High-level keywords focus on overarching concepts (e.g., architecture, dependencies, or system components). Low-level keywords focus on specific entities, such as functions, variables, modules, and specific calls. 
+
+---Instructions---
+
+- Data will be used for automation, so output raw JSON text without code blocks or formatting.
+- The JSON should have two keys:
+  - "high_level_keywords" for overarching concepts or themes.
+  - "low_level_keywords" for specific entities or concrete terms.
+
+######################
+-Examples-
+######################
+Example 1:
+
+Query: "How does the App component manage state using React hooks and render child components?"
+################
+Output:
+{{
+  "high_level_keywords": ["State management", "React hooks", "Component rendering"],
+  "low_level_keywords": ["App", "useState", "useEffect", "Component"]
+}}
+#############################
+Example 2:
+
+Query: "What are the roles of main and add functions in a simple C++ program?"
+################
+Output:
+{{
+  "high_level_keywords": ["Function roles", "C++ program structure"],
+  "low_level_keywords": ["main", "add", "int", "return"]
+}}
+#############################
+Example 3:
+
+Query: "Analyze how a Rust program prints variables using println macro."
+################
+Output:
+{{
+  "high_level_keywords": ["Rust program", "Printing variables", "Macros"],
+  "low_level_keywords": ["println!", "variable", "format string"]
+}}
+#############################
+-Real Data-
+######################
+Query: {query}
+######################
+Output:
+"""
+
+naive_rag_response = """You're a helpful assistant
+Below are the knowledge you know:
+{content_data}
+---
+If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up.
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+---Target response length and format---
+{response_type}
+"""
diff --git a/lightrag/prompts/default.toml b/lightrag/prompts/default.toml
new file mode 100644
index 000000000..874388e24
--- /dev/null
+++ b/lightrag/prompts/default.toml
@@ -0,0 +1,228 @@
+DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event", 'filepath']
+
+entity_extraction = """-Goal-
+Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+
+-Steps-
+1. Identify all entities. The file name should also be an entity. For each identified entity, extract the following information:
+- entity_name: Name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+- file_name: FILENAME field from metadata
+- file_path: FILE_PATH field from metadata
+- chunk_num: CHUNK_NUM field from metadata
+
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>{tuple_delimiter}<file_name>{tuple_delimiter}<file_path>{tuple_delimiter}<chunk_num>
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
+- file_name: FILENAME field from metadata
+- file_path: FILE_PATH field from metadata
+- chunk_num: CHUNK_NUM field from metadata
+
+Format each relationship as
+("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>{tuple_delimiter}<file_name>{tuple_delimiter}<file_path>{tuple_delimiter}<chunk_num>)
+
+3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+5. When finished, output {completion_delimiter}
+
+######################
+-Examples-
+######################
+
+*Example 1 (Entity Interactions in Text):
+####
+## FILENAME: fragment_1.txt
+## FILEPATH: ./src/fragment_1.txt
+## CHUNK_NUM: 0
+####
+while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
+
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
+
+The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
+
+It was a small transformation, barely perceptible, but one that Alex noted with an inward nod.
They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A text file capturing character dynamics and their shared commitment to discovery."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"filename"{tuple_delimiter}"A file focused on interpersonal dynamics among several individuals."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"A character who experiences frustration and observes the dynamics among others."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty, showing a change of perspective."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}"Alex appears in this file."{tuple_delimiter}"appearance"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor's attitude influences Alex's observation."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}8){record_delimiter} +("content_keywords"{tuple_delimiter}"power dynamics, discovery, change of perspective"){completion_delimiter} + +--- + +*Example 2 (Mission and Evolution): +#### +## FILENAME: mission_evolution.txt +## FILEPATH: ./docs/mission_evolution.txt +## CHUNK_NUM: 0 +#### +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations—it demanded new resolve. + +Tension threaded through the dialogue of beeps as communications with Washington buzzed in the background. The team stood enveloped in a portentous air. Their decisions could redefine humanity's place in the cosmos or condemn it to ignorance. + +The group moved from passive recipients to active participants. Mercer's instincts led the team to shift from observing to interacting, marking a metamorphosis within Operation: Dulce. 
+############# +Output: +("entity"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A document describing a mission's evolution."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"filename"{tuple_delimiter}"A file focused on the transformation of a mission."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a key communication hub in the narrative."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Washington"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}"Washington is referenced in this file."{tuple_delimiter}"reference"{tuple_delimiter}6){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington."{tuple_delimiter}"decision-making, influence"{tuple_delimiter}7){record_delimiter} +("content_keywords"{tuple_delimiter}"mission evolution, decision-making, cosmic impact"){completion_delimiter} + +--- + +*Example 3 (Exploration of Control and Communication): +#### +## FILENAME: control_and_communication.txt +## FILEPATH: ./docs/control_and_communication.txt +## CHUNK_NUM: 0 +#### +their voice slicing through the buzz of activity. "Control may be an illusion when facing intelligence that writes its own rules," they stated, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," said Sam Rivera from a nearby interface, with a mix of awe and anxiety. + +Alex surveyed his team, acknowledging that this might well be their first contact. "We need to be ready for whatever answers back," he added. + +Together, they forged humanity's response to a cosmic message, a moment that could rewrite human history. +############# +Output: +("entity"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A file exploring the themes of control and communication."{tuple_delimiter}"control_and_communication.txt"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"A participant observing intelligence and its communication."{tuple_delimiter}"control_and_communication.txt"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera interacts with the intelligent entity."{tuple_delimiter}"communication, observation"{tuple_delimiter}9){record_delimiter} +("content_keywords"{tuple_delimiter}"control, communication, intelligence, exploration"){completion_delimiter} + +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +summarize_entity_descriptions = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 
+Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
+If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
+Make sure it is written in third person, and include the entity names so we have the full context.
+
+#######
+-Data-
+Entities: {entity_name}
+Description List: {description_list}
+#######
+Output:
+"""
+
+entiti_continue_extraction = """MANY entities were missed in the last extraction. Add them below using the same format:
+"""
+
+entiti_if_loop_extraction = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
+"""
+
+fail_response = "Sorry, I'm not able to provide an answer to that question."
+
+rag_response = """---Role---
+
+You are a helpful assistant responding to questions about data in the tables provided.
+
+
+---Goal---
+
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+
+---Target response length and format---
+
+{response_type}
+
+---Data tables---
+
+{context_data}
+
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
+"""
+
+keywords_extraction = """---Role---
+
+You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
+
+---Goal---
+
+Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
+
+---Instructions---
+
+- Output the keywords in JSON format.
+- The JSON should have two keys:
+  - "high_level_keywords" for overarching concepts or themes.
+  - "low_level_keywords" for specific entities or details.
+
+######################
+-Examples-
+######################
+Example 1:
+
+Query: "How does international trade influence global economic stability?"
+################
+Output:
+{{
+  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
+  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
+}}
+#############################
+Example 2:
+
+Query: "What are the environmental consequences of deforestation on biodiversity?"
+################
+Output:
+{{
+  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
+  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
+}}
+#############################
+Example 3:
+
+Query: "What is the role of education in reducing poverty?"
+################
+Output:
+{{
+  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
+  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
+}}
+#############################
+-Real Data-
+######################
+Query: {query}
+######################
+Output:
+
+"""
+
+naive_rag_response = """You're a helpful assistant.
+Below is the knowledge you know:
+{content_data}
+---
+If you don't know the answer or if the provided knowledge does not contain sufficient information to provide an answer, just say so. Do not make anything up.
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+---Target response length and format---
+{response_type}
+"""
diff --git a/lightrag/storage.py b/lightrag/storage.py
index 1f22fc565..6fb7b3b68 100644
--- a/lightrag/storage.py
+++ b/lightrag/storage.py
@@ -110,6 +110,10 @@ async def query(self, query: str, top_k=5):
             {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
         ]
         return results
+
+    def dump(self):
+        for record in self._client.__storage["data"]:
+            print(record)
 
     async def index_done_callback(self):
         self._client.save()
@@ -140,7 +144,7 @@ def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
     graph = graph.copy()
     graph = cast(nx.Graph, largest_connected_component(graph))
     node_mapping = {
-        node: html.unescape(node.upper().strip()) for node in graph.nodes()
+        node: html.unescape(node.strip()) for node in graph.nodes()  # keep original casing (no longer uppercased)
     }  # type: ignore
     graph = nx.relabel_nodes(graph, node_mapping)
     return NetworkXStorage._stabilize_graph(graph)
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 67d094c62..0b26a2985 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import re
+import shutil
 from dataclasses import dataclass
 from functools import wraps
 from hashlib import md5
@@ -18,6 +19,12 @@
 
 
 def set_logger(log_file: str):
+    log_dir = os.path.dirname(log_file)  # Extract the directory from the log file path
+
+    if log_dir and not os.path.exists(log_dir):
+        print(f"Directory '{log_dir}' does not exist. Creating it...")
+        os.makedirs(log_dir, exist_ok=True)  # Create the directory
+
     logger.setLevel(logging.DEBUG)
 
     file_handler = logging.FileHandler(log_file)
@@ -30,6 +37,9 @@ def set_logger(log_file: str):
 
     if not logger.handlers:
         logger.addHandler(file_handler)
+
+    # disable logging (silences all standard log levels)
+    logging.disable(logging.CRITICAL + 1)
 
 
 @dataclass
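As a side note on the extraction prompt added earlier in this patch: the model is asked to emit records separated by {record_delimiter}, with fields separated by {tuple_delimiter} and a final {completion_delimiter}. The sketch below is one way such output could be split back into fields; it is not code from this patch, and the concrete delimiter strings and the helper name are assumed placeholder values chosen for illustration only.

# Minimal parsing sketch (illustrative, not part of the patch). The delimiter
# values below are assumptions standing in for {tuple_delimiter},
# {record_delimiter} and {completion_delimiter}; the real values are
# substituted into the prompt at build time.
TUPLE_DELIMITER = "<|>"
RECORD_DELIMITER = "##"
COMPLETION_DELIMITER = "<|COMPLETE|>"

def parse_extraction_output(raw: str) -> list[list[str]]:
    # Drop the completion marker, then split the response into records.
    raw = raw.replace(COMPLETION_DELIMITER, "")
    records = []
    for chunk in raw.split(RECORD_DELIMITER):
        chunk = chunk.strip().strip("()")
        if not chunk:
            continue
        # Each record is a parenthesized tuple of quoted fields.
        fields = [field.strip().strip('"') for field in chunk.split(TUPLE_DELIMITER)]
        records.append(fields)
    return records

# Example: one "entity" record followed by the completion marker.
sample = '("entity"<|>"Alex"<|>"person"<|>"A character who observes the team.")##<|COMPLETE|>'
print(parse_extraction_output(sample))
# -> [['entity', 'Alex', 'person', 'A character who observes the team.']]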