Skip to content

Commit

Permalink
feat: add sub-community report
Browse files Browse the repository at this point in the history
  • Loading branch information
gusye1234 committed Aug 23, 2024
1 parent f22ca3b commit f4811e5
Show file tree
Hide file tree
Showing 5 changed files with 409 additions and 16 deletions.
98 changes: 86 additions & 12 deletions nano_graphrag/_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,49 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
return knwoledge_graph_inst


def _pack_single_community_by_sub_communities(
    community: SingleCommunitySchema,
    max_token_size: int,
    already_reports: dict[str, CommunitySchema],
) -> tuple[str, int, set, set]:
    """Describe a community through the reports of its sub-communities.

    Sub-communities listed in ``community["sub_communities"]`` that have no
    entry in ``already_reports`` are skipped. The remaining reports are
    ordered by ``occurrence`` (highest first) and truncated by token size
    on their ``report_string`` with a budget of ``max_token_size``.

    Args:
        community: The parent community; only ``"sub_communities"`` is read.
        max_token_size: Token budget passed to the truncation helper.
        already_reports: Previously generated reports keyed by community id.

    Returns:
        A 4-tuple of:
        - the CSV text (header ``id, report, rating, importance``) of the
          kept sub-community reports,
        - the token count of that CSV text,
        - the set of nodes covered by the kept sub-communities,
        - the set of edges (each as a tuple) covered by them.
    """
    # Highest-occurrence reports come first so they survive truncation.
    sub_communities = sorted(
        (
            already_reports[k]
            for k in community["sub_communities"]
            if k in already_reports
        ),
        key=lambda x: x["occurrence"],
        reverse=True,
    )
    kept_sub_communities = truncate_list_by_token_size(
        sub_communities,
        key=lambda x: x["report_string"],
        max_token_size=max_token_size,
    )

    sub_fields = ["id", "report", "rating", "importance"]
    rows = [
        [
            i,
            c["report_string"],
            # Reports without a rating are marked with -1.
            c["report_json"].get("rating", -1),
            c["occurrence"],
        ]
        for i, c in enumerate(kept_sub_communities)
    ]
    sub_communities_describe = list_of_list_to_csv([sub_fields] + rows)

    # Collect what the kept reports already cover; edges are converted to
    # tuples so they are hashable.
    contain_nodes = set()
    contain_edges = set()
    for c in kept_sub_communities:
        contain_nodes.update(c["nodes"])
        contain_edges.update(tuple(e) for e in c["edges"])

    return (
        sub_communities_describe,
        len(encode_string_by_tiktoken(sub_communities_describe)),
        contain_nodes,
        contain_edges,
    )


async def _pack_single_community_describe(
knwoledge_graph_inst: BaseGraphStorage,
community: SingleCommunitySchema,
Expand Down Expand Up @@ -386,22 +429,52 @@ async def _pack_single_community_describe(
edges_may_truncate_list_data = truncate_list_by_token_size(
edges_list_data, key=lambda x: x[3], max_token_size=max_token_size // 2
)
if len(nodes_list_data) > len(nodes_may_truncate_list_data) or len(

truncated = len(nodes_list_data) > len(nodes_may_truncate_list_data) or len(
edges_list_data
) > len(edges_may_truncate_list_data):
# If context is exceed the limit:
if not len(community["sub_communities"]):
pass
elif not len(already_reports):
logger.warning(
"unknown error for community reports, maybe the storage is damaged"
) > len(edges_may_truncate_list_data)

# If context is exceed the limit and have sub-communities:
report_describe = ""
if truncated and len(community["sub_communities"]) and len(already_reports):
logger.info(
f"Community {community['title']} exceeds the limit, using its sub-communities"
)
report_describe, report_size, contain_nodes, contain_edges = (
_pack_single_community_by_sub_communities(
community, max_token_size, already_reports
)
else:
pass
)
report_exclude_nodes_list_data = [
n for n in nodes_list_data if n[1] not in contain_nodes
]
report_include_nodes_list_data = [
n for n in nodes_list_data if n[1] in contain_nodes
]
report_exclude_edges_list_data = [
e for e in edges_list_data if (e[1], e[2]) not in contain_edges
]
report_include_edges_list_data = [
e for e in edges_list_data if (e[1], e[2]) in contain_edges
]
# if report size is bigger than max_token_size, nodes and edges are []
nodes_may_truncate_list_data = truncate_list_by_token_size(
report_exclude_nodes_list_data + report_include_nodes_list_data,
key=lambda x: x[3],
max_token_size=(max_token_size - report_size) // 2,
)
edges_may_truncate_list_data = truncate_list_by_token_size(
report_exclude_edges_list_data + report_include_edges_list_data,
key=lambda x: x[3],
max_token_size=(max_token_size - report_size) // 2,
)
nodes_describe = list_of_list_to_csv([node_fields] + nodes_may_truncate_list_data)
edges_describe = list_of_list_to_csv([edge_fields] + edges_may_truncate_list_data)

return f"""-----Entities-----
return f"""-----Reports-----
```csv
{report_describe}
```
-----Entities-----
```csv
{nodes_describe}
```
Expand Down Expand Up @@ -456,6 +529,7 @@ async def _form_single_community_report(
knwoledge_graph_inst,
community,
max_token_size=global_config["best_model_max_token_size"],
already_reports=already_reports,
)
prompt = community_report_prompt.format(input_text=describe)
response = await use_llm_func(prompt, **llm_extra_kwargs)
Expand Down
4 changes: 3 additions & 1 deletion nano_graphrag/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"):

def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int):
"""Truncate a list of data by token size"""
if max_token_size <= 0:
return []
tokens = 0
for i, data in enumerate(list_data):
tokens += len(encode_string_by_tiktoken(key(data)))
Expand Down Expand Up @@ -114,7 +116,7 @@ async def __call__(self, *args, **kwargs) -> np.ndarray:


# Decorators ------------------------------------------------------------------------
def limit_async_func_call(max_size: int, waitting_time: float = 0.001):
def limit_async_func_call(max_size: int, waitting_time: float = 0.0001):
"""Add restriction of maximum async calling times for a async func"""

def final_decro(func):
Expand Down
4 changes: 2 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,10 @@ You can refer to `nano_graphrag.base` to see detailed interfaces for each compon

## TODO in Next Version

> If a checkbox is filled, it means someone is working on it.
> If a checkbox is filled, it means it's done.
- [ ] `nano-graphrag`'s Data Source Id is local, meaning it always starts at 0 at any response and you have to remap it into the current session. So it's kinda useless right now.
- [ ] `nano-graphrag` truncates the community's raw description if it exceeds the maximum context size when generating community report, while GraphRAG uses a sub-communities iterative summary to include all.
- [x] `nano-graphrag` truncates the community's raw description if it exceeds the maximum context size when generating community report, while GraphRAG uses a sub-communities iterative summary to include all.
- [ ] Add real benchmark with GraphRAG
- [ ] Add new components, see [issue](https://github.com/gusye1234/nano-graphrag/issues/2)

3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
sentence_transformers
flake8
pytest
pytest
future
Loading

0 comments on commit f4811e5

Please sign in to comment.