Skip to content

Commit

Permalink
Merge pull request #15 from severian42/dev
Browse files Browse the repository at this point in the history
Enhance indexing and querying functionalities
  • Loading branch information
severian42 authored Jul 16, 2024
2 parents df06e5a + 42f5cec commit 595abb7
Show file tree
Hide file tree
Showing 98 changed files with 261 additions and 186 deletions.
Binary file modified .DS_Store
Binary file not shown.
142 changes: 99 additions & 43 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@
from ollama import chat
import pyarrow.parquet as pq
import pandas as pd
import sys

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

import gradio as gr
from graphrag.query import cli

# Set up logging
log_queue = queue.Queue()
Expand Down Expand Up @@ -80,14 +87,8 @@ def create_setting_component(key, value):
outputs=[status]
)

def run_command(command):
    """Execute *command* in a shell and return its captured stdout.

    On a non-zero exit status the process's captured stderr is returned
    as an ``Error: ...`` string instead of raising.
    """
    try:
        completed = subprocess.run(
            command,
            shell=True,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        return f"Error: {exc.stderr}"
    return completed.stdout

def index_graph(root_dir, progress=gr.Progress()):
def index_graph(progress=gr.Progress()):
root_dir = "./ragtest"
command = f"python -m graphrag.index --root {root_dir}"
logging.info(f"Running indexing command: {command}")

Expand Down Expand Up @@ -129,10 +130,28 @@ def run_command_with_output():
logging.info("Indexing completed")
return "\n".join(full_output), update_logs()

def run_query(root_dir, method, query, history):
command = f"python -m graphrag.query --root {root_dir} --method {method} \"{query}\""
result = run_command(command)
return result
def run_query(root_dir, method, query, history, model, temperature, max_tokens):
    """Answer *query* with the Ollama chat endpoint, replaying *history*.

    ``root_dir`` is accepted for interface compatibility but not used here.
    Only well-formed ``(human, ai)`` tuples from *history* are replayed.
    Returns the assistant's reply text, or an ``Error: ...`` string on
    failure.
    """
    system_message = (
        f"You are a helpful assistant performing a {method} search "
        "on the knowledge graph. Provide a concise and relevant answer "
        "based on the query."
    )
    messages = [{"role": "system", "content": system_message}]
    for entry in history:
        if isinstance(entry, tuple) and len(entry) == 2:
            user_text, assistant_text = entry
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": query})

    try:
        reply = chat(
            model=model,
            messages=messages,
            options={"temperature": temperature, "num_predict": max_tokens},
        )
        return reply['message']['content']
    except Exception as e:
        return f"Error: {str(e)}"

def upload_file(file):
if file is not None:
Expand Down Expand Up @@ -230,17 +249,28 @@ def manage_data():
"input_files": input_files
}


def find_latest_graph_file(root_dir):
    """Return the path of the most recently modified ``*.graphml`` artifact.

    Searches ``<root_dir>/output/*/artifacts/*.graphml``. If nothing
    matches, falls back to scanning the lexicographically latest run
    directory (run folders appear to be timestamp-named, so ``max()``
    picks the newest) while skipping macOS ``.DS_Store`` entries.

    Args:
        root_dir: Project root containing the ``output`` directory.

    Returns:
        Path to the newest graph file, or ``None`` when no graph file
        (or no output directory at all) exists.
    """
    pattern = os.path.join(root_dir, "output", "*", "artifacts", "*.graphml")
    graph_files = glob.glob(pattern)
    if not graph_files:
        output_dir = os.path.join(root_dir, "output")
        # Guard: previously a missing output directory crashed os.listdir()
        # with FileNotFoundError; report "no graph" instead.
        if not os.path.isdir(output_dir):
            return None
        # If no files found, try excluding .DS_Store
        run_dirs = [
            d for d in os.listdir(output_dir)
            if os.path.isdir(os.path.join(output_dir, d)) and d != ".DS_Store"
        ]
        if run_dirs:
            latest_run = max(run_dirs)
            pattern = os.path.join(root_dir, "output", latest_run, "artifacts", "*.graphml")
            graph_files = glob.glob(pattern)

    if not graph_files:
        return None

    # Sort files by modification time, most recent first
    return max(graph_files, key=os.path.getmtime)

def update_visualization(root_dir, folder_name, file_name):
def update_visualization(folder_name, file_name):
root_dir = "./ragtest"
if not folder_name or not file_name:
return None, "Please select a folder and a GraphML file."
file_name = file_name.split("] ")[1] if "]" in file_name else file_name # Remove file type prefix
Expand Down Expand Up @@ -345,9 +375,13 @@ def update_logs():

def chat_with_llm(message, history, system_message, temperature, max_tokens, model):
messages = [{"role": "system", "content": system_message}]
for human, ai in history:
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": ai})
for item in history:
if isinstance(item, tuple) and len(item) == 2:
human, ai = item
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": ai})
elif isinstance(item, str):
messages.append({"role": "user", "content": item})
messages.append({"role": "user", "content": message})

try:
Expand All @@ -363,16 +397,17 @@ def chat_with_llm(message, history, system_message, temperature, max_tokens, mod
except Exception as e:
return f"Error: {str(e)}"

def send_message(root_dir, query_type, query, history, system_message, temperature, max_tokens, model):
if query_type == "global":
result = run_query(root_dir, "global", query, history)
history.append((query, result))
elif query_type == "local":
result = run_query(root_dir, "local", query, history)
history.append((query, result))
else: # Direct chat
result = chat_with_llm(query, history, system_message, temperature, max_tokens, model)
def send_message(query_type, query, history, system_message, temperature, max_tokens, model):
    """Dispatch *query* to a graph search or a direct LLM chat.

    Appends the ``(query, result)`` pair to *history* and returns the
    updated history, a cleared input box, and refreshed logs for the
    Gradio UI.
    """
    root_dir = "./ragtest"
    try:
        if query_type in ["global", "local"]:
            answer = run_query(root_dir, query_type, query, history, model, temperature, max_tokens)
        else:
            # Anything other than a graph search falls through to plain chat.
            answer = chat_with_llm(query, history, system_message, temperature, max_tokens, model)
        history.append((query, answer))
    except Exception as e:
        history.append((query, f"An error occurred: {str(e)}"))
    return history, gr.update(value=""), update_logs()

def fetch_ollama_models():
Expand Down Expand Up @@ -659,16 +694,19 @@ def update_file_content(file_path):
return f"Error reading file: {str(e)}"

def update_output_folder_list():
    """Refresh the output-folder dropdown from ``./ragtest/output``."""
    folders = list_output_folders("./ragtest")
    default = folders[0] if folders else None
    return gr.update(choices=folders, value=default)

def update_folder_content_list(root_dir, folder_name):
def update_folder_content_list(folder_name):
    """List the selected output folder's contents for the UI dropdown."""
    root_dir = "./ragtest"
    if not folder_name:
        # Nothing selected yet — clear the dropdown.
        return gr.update(choices=[])
    folder_path = os.path.join(root_dir, "output", folder_name)
    return gr.update(choices=list_folder_contents(folder_path))

def handle_content_selection(root_dir, folder_name, selected_item):
def handle_content_selection(folder_name, selected_item):
root_dir = "./ragtest"
if isinstance(selected_item, list) and selected_item:
selected_item = selected_item[0] # Take the first item if it's a list

Expand All @@ -687,7 +725,8 @@ def handle_content_selection(root_dir, folder_name, selected_item):
else:
return gr.update(), "", ""

def initialize_selected_folder(root_dir, folder_name):
def initialize_selected_folder(folder_name):
root_dir = "./ragtest"
if not folder_name:
return "Please select a folder first.", gr.update(choices=[])
folder_path = os.path.join(root_dir, "output", folder_name, "artifacts")
Expand Down Expand Up @@ -740,11 +779,10 @@ def list_folder_contents(folder_path):
operation_status = gr.Textbox(label="Operation Status", visible=False)


with gr.Accordion("Indexing", open=True):
root_dir = gr.Textbox(label="Root Directory", value=os.path.abspath("./ragtest"))
index_btn = gr.Button("Run Indexing", variant="primary")
index_output = gr.Textbox(label="Indexing Output", lines=10, visible=True)
index_progress = gr.Textbox(label="Indexing Progress", visible=True)
with gr.Accordion("Indexing", open=False):
index_btn = gr.Button("Run Indexing", variant="primary")
index_output = gr.Textbox(label="Indexing Output", lines=10, visible=True)
index_progress = gr.Textbox(label="Indexing Progress", visible=True)

with gr.TabItem("Indexing Outputs"):
output_folder_list = gr.Dropdown(label="Select Output Folder", choices=[], interactive=True)
Expand Down Expand Up @@ -805,38 +843,53 @@ def list_folder_contents(folder_path):
save_btn.click(fn=save_file_content, inputs=[file_list, file_content], outputs=[operation_status, log_output])
index_btn.click(
fn=index_graph,
inputs=[root_dir],
outputs=[index_output, log_output],
show_progress=True
)
refresh_folder_btn.click(fn=update_output_folder_list, outputs=[output_folder_list]).then(
fn=update_logs,
outputs=[log_output]
)
output_folder_list.change(fn=update_folder_content_list, inputs=[root_dir, output_folder_list], outputs=[folder_content_list]).then(
output_folder_list.change(
fn=update_folder_content_list,
inputs=[output_folder_list],
outputs=[folder_content_list]
).then(
fn=update_logs,
outputs=[log_output]
)
folder_content_list.change(fn=handle_content_selection, inputs=[root_dir, output_folder_list, folder_content_list], outputs=[folder_content_list, file_info, output_content]).then(
folder_content_list.change(
fn=handle_content_selection,
inputs=[output_folder_list, folder_content_list],
outputs=[folder_content_list, file_info, output_content]
).then(
fn=update_logs,
outputs=[log_output]
)
initialize_folder_btn.click(fn=initialize_selected_folder, inputs=[root_dir, output_folder_list], outputs=[initialization_status, folder_content_list]).then(
initialize_folder_btn.click(
fn=initialize_selected_folder,
inputs=[output_folder_list],
outputs=[initialization_status, folder_content_list]
).then(
fn=update_logs,
outputs=[log_output]
)
vis_btn.click(fn=update_visualization, inputs=[root_dir, output_folder_list, folder_content_list], outputs=[vis_output, vis_status]).then(
vis_btn.click(
fn=update_visualization,
inputs=[output_folder_list, folder_content_list],
outputs=[vis_output, vis_status]
).then(
fn=update_logs,
outputs=[log_output]
)
query_btn.click(
fn=send_message,
inputs=[root_dir, query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
inputs=[query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
outputs=[chatbot, query_input, log_output]
)
query_input.submit(
fn=send_message,
inputs=[root_dir, query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
inputs=[query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
outputs=[chatbot, query_input, log_output]
)
refresh_models_btn.click(
Expand Down Expand Up @@ -866,6 +919,9 @@ def list_folder_contents(folder_path):
document.addEventListener('DOMContentLoaded', addShiftEnterListener);
""")


demo = demo.queue()


if __name__ == "__main__":
demo.queue()
demo.launch()
demo.launch(share=True, reload=False)
Binary file modified graphrag/.DS_Store
Binary file not shown.
Binary file modified graphrag/index/.DS_Store
Binary file not shown.
Binary file added graphrag/index/graph/.DS_Store
Binary file not shown.
Binary file modified graphrag/index/graph/utils/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified graphrag/index/utils/__pycache__/rate_limiter.cpython-310.pyc
Binary file not shown.
Binary file added graphrag/index/verbs/.DS_Store
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified graphrag/query/.DS_Store
Binary file not shown.
Binary file modified graphrag/query/__pycache__/cli.cpython-310.pyc
Binary file not shown.
51 changes: 43 additions & 8 deletions graphrag/query/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
"""Command line interface for the query module."""

import os
import glob
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from pathlib import Path
from typing import cast

Expand Down Expand Up @@ -59,19 +66,47 @@ def run_global_search(
community_level: int,
response_type: str,
query: str,
config: GraphRagConfig,
):
"""Run a global search with the given query."""
data_dir, root_dir, config = _configure_paths_and_settings(data_dir, root_dir)
data_path = Path(data_dir)
logger.info(f"Starting global search with query: {query}")

root_dir = config.root_dir
if not root_dir:
raise ValueError("Root directory is not defined in the configuration")

output_dir = os.path.join(root_dir, "output")
logger.info(f"Output directory: {output_dir}")

if not os.path.exists(output_dir):
raise FileNotFoundError(f"Output directory does not exist: {output_dir}")

# Find the latest run directory
run_dirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d))]
if not run_dirs:
raise FileNotFoundError(f"No run directories found in {output_dir}")

latest_run = max(run_dirs) # Assumes directory names are sortable (e.g., timestamps)
logger.info(f"Latest run directory: {latest_run}")

parquet_path = os.path.join(output_dir, latest_run, "artifacts", "create_final_nodes.parquet")
logger.info(f"Looking for parquet file: {parquet_path}")

if not os.path.exists(parquet_path):
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")

try:
final_nodes: pd.DataFrame = pd.read_parquet(parquet_path)
logger.info(f"Successfully read parquet file with {len(final_nodes)} rows")
except Exception as e:
logger.error(f"Error reading parquet file {parquet_path}: {str(e)}")
raise IOError(f"Error reading parquet file {parquet_path}: {str(e)}")

final_nodes: pd.DataFrame = pd.read_parquet(
data_path / "create_final_nodes.parquet"
)
final_entities: pd.DataFrame = pd.read_parquet(
data_path / "create_final_entities.parquet"
data_dir / "create_final_entities.parquet"
)
final_community_reports: pd.DataFrame = pd.read_parquet(
data_path / "create_final_community_reports.parquet"
data_dir / "create_final_community_reports.parquet"
)

reports = read_indexer_reports(
Expand Down Expand Up @@ -209,4 +244,4 @@ def _read_config_parameters(root: str):
return create_graphrag_config(data, root)

reporter.info("Reading settings from environment variables")
return create_graphrag_config(root_dir=root)
return create_graphrag_config(root_dir=root)
Binary file modified lancedb/.DS_Store
Binary file not shown.
Binary file modified lancedb/description_embedding.lance/.DS_Store
Binary file not shown.
Binary file modified lancedb/description_embedding.lance/_latest.manifest
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

$3f2722e9-ac06-467a-b742-3be376b857e2��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
 $172bd179-ce47-424f-8d9d-8a86eb9f2bad��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
$28145788-7dd4-4627-a7e7-eaf74dba331b��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
$da1ac31f-589a-4264-a164-214dea8acd39��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified ragtest/.DS_Store
Binary file not shown.
Binary file modified ragtest/cache/.DS_Store
Binary file not shown.
Loading

0 comments on commit 595abb7

Please sign in to comment.