Skip to content

Commit

Permalink
Enhance indexing and querying functionalities
Browse files Browse the repository at this point in the history
- Updated `index_graph` function to include detailed progress tracking and multi-threaded output handling.
- Improved `run_query` and `send_message` functions to handle model parameters and exceptions robustly.
- Refactored UI components for better user interaction:
  - Removed redundant `root_dir` parameter from several functions.
  - Enhanced folder and file selection interactions in the indexing outputs tab.
  - Improved feedback and status updates for file operations and indexing progress.
- Added progress bar and detailed logging for indexing process.
- Set up global `root_dir` value for consistent directory handling.
- Minor bug fixes and performance optimizations.

Changes:
- Modified `index_graph` to provide real-time progress updates.
- Refactored `send_message` to streamline query execution and error handling.
- Improved `update_visualization`, `update_output_folder_list`, `update_folder_content_list`, `handle_content_selection`, and `initialize_selected_folder` for better user experience.
- Updated UI elements in the GraphRAG Local UI for clarity and usability.

Reviewed and tested all changes to ensure stability and performance improvements.
  • Loading branch information
severian42 committed Jul 16, 2024
1 parent 9e8039a commit 4484c3e
Show file tree
Hide file tree
Showing 98 changed files with 305 additions and 188 deletions.
Binary file modified .DS_Store
Binary file not shown.
188 changes: 143 additions & 45 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@
from ollama import chat
import pyarrow.parquet as pq
import pandas as pd
import sys

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

import gradio as gr
from graphrag.query import cli

# Set up logging
log_queue = queue.Queue()
Expand Down Expand Up @@ -80,24 +87,71 @@ def create_setting_component(key, value):
outputs=[status]
)

def run_command(command):
    """Execute *command* through the shell and return its captured stdout.

    On a non-zero exit status, the subprocess's captured stderr is returned
    instead, prefixed with "Error: ", so callers always get back a string.
    """
    try:
        completed = subprocess.run(
            command,
            shell=True,  # command arrives as a single shell string
            check=True,
            capture_output=True,
            text=True,
        )
        return completed.stdout
    except subprocess.CalledProcessError as exc:
        return f"Error: {exc.stderr}"

def index_graph(root_dir):
def index_graph(progress=gr.Progress()):
    """Run the GraphRAG indexing pipeline over ./ragtest, streaming output.

    Generator: yields ``(accumulated_output, logs)`` tuples so the Gradio UI
    can show live output while the external indexing process runs.

    Fixes vs. the previous version: removed a leftover blocking
    ``run_command(command)`` call that ran the indexer a second time before
    the streaming run, and the unreachable duplicate ``return`` that
    followed the yield loop; the final state is now yielded (a generator's
    return value never reaches the UI).
    """
    root_dir = "./ragtest"
    command = f"python -m graphrag.index --root {root_dir}"
    logging.info(f"Running indexing command: {command}")

    # Queue shared between the reader thread and this generator.
    output_queue = queue.Queue()

    def run_command_with_output():
        # Merge stderr into stdout so one stream carries all indexer output.
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        for line in iter(process.stdout.readline, ''):
            output_queue.put(line)
        process.stdout.close()
        process.wait()

    # Run the command in a worker thread so we can poll its output here.
    thread = threading.Thread(target=run_command_with_output)
    thread.start()

    progress(0, desc="Starting indexing...")

    # Drain the queue until the worker has finished and nothing is pending.
    full_output = []
    while thread.is_alive() or not output_queue.empty():
        try:
            line = output_queue.get_nowait()
            full_output.append(line)

            # Coarse progress heuristics keyed on known indexer log phrases.
            if "Processing file" in line:
                progress((0.5, None), desc="Processing files...")
            elif "Indexing completed" in line:
                progress(1, desc="Indexing completed")

            yield "\n".join(full_output), update_logs()
        except queue.Empty:
            time.sleep(0.1)  # avoid a busy-wait while the process is quiet

    thread.join()
    logging.info("Indexing completed")
    yield "\n".join(full_output), update_logs()

def run_query(root_dir, method, query, history):
command = f"python -m graphrag.query --root {root_dir} --method {method} \"{query}\""
result = run_command(command)
return result
def run_query(root_dir, method, query, history, model, temperature, max_tokens):
    """Answer *query* with the local LLM, framed as a graph search of *method*.

    History entries that are (human, ai) 2-tuples are replayed as alternating
    user/assistant turns; anything else in *history* is ignored.  Returns the
    model's reply text, or an "Error: ..." string if the chat call fails.
    """
    system_message = f"You are a helpful assistant performing a {method} search on the knowledge graph. Provide a concise and relevant answer based on the query."
    conversation = [{"role": "system", "content": system_message}]
    for entry in history:
        if not (isinstance(entry, tuple) and len(entry) == 2):
            continue
        user_turn, assistant_turn = entry
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": query})

    try:
        reply = chat(
            model=model,
            messages=conversation,
            options={"temperature": temperature, "num_predict": max_tokens},
        )
        return reply['message']['content']
    except Exception as e:
        return f"Error: {str(e)}"

def upload_file(file):
if file is not None:
Expand Down Expand Up @@ -195,17 +249,28 @@ def manage_data():
"input_files": input_files
}


def find_latest_graph_file(root_dir):
    """Return the most recently modified .graphml artifact under *root_dir*.

    Searches ``root_dir/output/*/artifacts/*.graphml``.  If the wildcard
    search finds nothing, falls back to the lexically-latest run directory
    (skipping ``.DS_Store``).  Returns ``None`` when no graph file — or no
    output directory at all — exists.

    Fix: previously ``os.listdir`` raised ``FileNotFoundError`` when the
    output directory did not exist yet (i.e. before the first indexing run);
    that case now returns ``None``.
    """
    pattern = os.path.join(root_dir, "output", "*", "artifacts", "*.graphml")
    graph_files = glob.glob(pattern)
    if not graph_files:
        output_dir = os.path.join(root_dir, "output")
        # Guard: before the first indexing run the output dir may be absent.
        if not os.path.isdir(output_dir):
            return None
        run_dirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d)) and d != ".DS_Store"]
        if run_dirs:
            # Assumes run directory names sort chronologically (timestamps).
            latest_run = max(run_dirs)
            pattern = os.path.join(root_dir, "output", latest_run, "artifacts", "*.graphml")
            graph_files = glob.glob(pattern)

    if not graph_files:
        return None

    # Pick by modification time so the newest run's artifact wins.
    return max(graph_files, key=os.path.getmtime)

def update_visualization(root_dir, folder_name, file_name):
def update_visualization(folder_name, file_name):
root_dir = "./ragtest"
if not folder_name or not file_name:
return None, "Please select a folder and a GraphML file."
file_name = file_name.split("] ")[1] if "]" in file_name else file_name # Remove file type prefix
Expand Down Expand Up @@ -310,9 +375,13 @@ def update_logs():

def chat_with_llm(message, history, system_message, temperature, max_tokens, model):
messages = [{"role": "system", "content": system_message}]
for human, ai in history:
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": ai})
for item in history:
if isinstance(item, tuple) and len(item) == 2:
human, ai = item
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": ai})
elif isinstance(item, str):
messages.append({"role": "user", "content": item})
messages.append({"role": "user", "content": message})

try:
Expand All @@ -328,16 +397,17 @@ def chat_with_llm(message, history, system_message, temperature, max_tokens, mod
except Exception as e:
return f"Error: {str(e)}"

def send_message(root_dir, query_type, query, history, system_message, temperature, max_tokens, model):
if query_type == "global":
result = run_query(root_dir, "global", query, history)
history.append((query, result))
elif query_type == "local":
result = run_query(root_dir, "local", query, history)
history.append((query, result))
else: # Direct chat
result = chat_with_llm(query, history, system_message, temperature, max_tokens, model)
def send_message(query_type, query, history, system_message, temperature, max_tokens, model):
    """Dispatch *query* to a graph search or a direct chat and record the turn.

    Appends ``(query, reply)`` — or ``(query, error text)`` on failure — to
    *history*, and returns ``(history, cleared-input update, refreshed logs)``
    for the Gradio outputs.
    """
    root_dir = "./ragtest"
    try:
        if query_type in ("global", "local"):
            reply = run_query(root_dir, query_type, query, history, model, temperature, max_tokens)
        else:
            # Any other query type falls through to a plain LLM conversation.
            reply = chat_with_llm(query, history, system_message, temperature, max_tokens, model)
        history.append((query, reply))
    except Exception as e:
        history.append((query, f"An error occurred: {str(e)}"))
    return history, gr.update(value=""), update_logs()

def fetch_ollama_models():
Expand Down Expand Up @@ -624,16 +694,19 @@ def update_file_content(file_path):
return f"Error reading file: {str(e)}"

def update_output_folder_list():
    """Refresh the output-folder dropdown, preselecting the first entry."""
    root_dir = "./ragtest"
    available = list_output_folders(root_dir)
    default = available[0] if available else None
    return gr.update(choices=available, value=default)

def update_folder_content_list(root_dir, folder_name):
def update_folder_content_list(folder_name):
    """Populate the contents dropdown for the selected output folder."""
    root_dir = "./ragtest"
    if not folder_name:
        # No folder selected yet: present an empty choice list.
        return gr.update(choices=[])
    folder_path = os.path.join(root_dir, "output", folder_name)
    return gr.update(choices=list_folder_contents(folder_path))

def handle_content_selection(root_dir, folder_name, selected_item):
def handle_content_selection(folder_name, selected_item):
root_dir = "./ragtest"
if isinstance(selected_item, list) and selected_item:
selected_item = selected_item[0] # Take the first item if it's a list

Expand All @@ -652,7 +725,8 @@ def handle_content_selection(root_dir, folder_name, selected_item):
else:
return gr.update(), "", ""

def initialize_selected_folder(root_dir, folder_name):
def initialize_selected_folder(folder_name):
root_dir = "./ragtest"
if not folder_name:
return "Please select a folder first.", gr.update(choices=[])
folder_path = os.path.join(root_dir, "output", folder_name, "artifacts")
Expand Down Expand Up @@ -681,13 +755,13 @@ def list_folder_contents(folder_path):
default_model = settings['llm']['model']

with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
gr.Markdown("# GraphRAG UI", elem_id="title")
gr.Markdown("# GraphRAG Local UI", elem_id="title")

with gr.Row(elem_id="main-container"):
with gr.Column(scale=1, elem_id="left-column"):
with gr.Tabs():
with gr.TabItem("Data Management"):
with gr.Accordion("File Operations", open=False):
with gr.Accordion("File Upload (.txt)", open=True):
file_upload = gr.File(label="Upload .txt File", file_types=[".txt"])
upload_btn = gr.Button("Upload File", variant="primary")
upload_output = gr.Textbox(label="Upload Status", visible=False)
Expand All @@ -705,10 +779,10 @@ def list_folder_contents(folder_path):
operation_status = gr.Textbox(label="Operation Status", visible=False)


with gr.Accordion("Indexing", open=False):
root_dir = gr.Textbox(label="Root Directory", value=os.path.abspath("./ragtest"))
index_btn = gr.Button("Run Indexing", variant="primary")
index_output = gr.Textbox(label="Indexing Output", lines=5, visible=False)
with gr.Accordion("Indexing", open=False):
index_btn = gr.Button("Run Indexing", variant="primary")
index_output = gr.Textbox(label="Indexing Output", lines=10, visible=True)
index_progress = gr.Textbox(label="Indexing Progress", visible=True)

with gr.TabItem("Indexing Outputs"):
output_folder_list = gr.Dropdown(label="Select Output Folder", choices=[], interactive=True)
Expand Down Expand Up @@ -767,35 +841,55 @@ def list_folder_contents(folder_path):
)
delete_btn.click(fn=delete_file, inputs=[file_list], outputs=[operation_status, file_list, log_output])
save_btn.click(fn=save_file_content, inputs=[file_list, file_content], outputs=[operation_status, log_output])
index_btn.click(fn=index_graph, inputs=[root_dir], outputs=[index_output, log_output])
index_btn.click(
fn=index_graph,
outputs=[index_output, log_output],
show_progress=True
)
refresh_folder_btn.click(fn=update_output_folder_list, outputs=[output_folder_list]).then(
fn=update_logs,
outputs=[log_output]
)
output_folder_list.change(fn=update_folder_content_list, inputs=[root_dir, output_folder_list], outputs=[folder_content_list]).then(
output_folder_list.change(
fn=update_folder_content_list,
inputs=[output_folder_list],
outputs=[folder_content_list]
).then(
fn=update_logs,
outputs=[log_output]
)
folder_content_list.change(fn=handle_content_selection, inputs=[root_dir, output_folder_list, folder_content_list], outputs=[folder_content_list, file_info, output_content]).then(
folder_content_list.change(
fn=handle_content_selection,
inputs=[output_folder_list, folder_content_list],
outputs=[folder_content_list, file_info, output_content]
).then(
fn=update_logs,
outputs=[log_output]
)
initialize_folder_btn.click(fn=initialize_selected_folder, inputs=[root_dir, output_folder_list], outputs=[initialization_status, folder_content_list]).then(
initialize_folder_btn.click(
fn=initialize_selected_folder,
inputs=[output_folder_list],
outputs=[initialization_status, folder_content_list]
).then(
fn=update_logs,
outputs=[log_output]
)
vis_btn.click(fn=update_visualization, inputs=[root_dir, output_folder_list, folder_content_list], outputs=[vis_output, vis_status]).then(
vis_btn.click(
fn=update_visualization,
inputs=[output_folder_list, folder_content_list],
outputs=[vis_output, vis_status]
).then(
fn=update_logs,
outputs=[log_output]
)
query_btn.click(
fn=send_message,
inputs=[root_dir, query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
inputs=[query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
outputs=[chatbot, query_input, log_output]
)
query_input.submit(
fn=send_message,
inputs=[root_dir, query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
inputs=[query_type, query_input, chatbot, system_message, temperature, max_tokens, model],
outputs=[chatbot, query_input, log_output]
)
refresh_models_btn.click(
Expand Down Expand Up @@ -825,5 +919,9 @@ def list_folder_contents(folder_path):
document.addEventListener('DOMContentLoaded', addShiftEnterListener);
""")


demo = demo.queue()


if __name__ == "__main__":
demo.launch()
demo.launch(share=True, reload=False)
Binary file modified graphrag/.DS_Store
Binary file not shown.
Binary file modified graphrag/index/.DS_Store
Binary file not shown.
Binary file added graphrag/index/graph/.DS_Store
Binary file not shown.
Binary file modified graphrag/index/graph/utils/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified graphrag/index/utils/__pycache__/rate_limiter.cpython-310.pyc
Binary file not shown.
Binary file added graphrag/index/verbs/.DS_Store
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified graphrag/query/.DS_Store
Binary file not shown.
Binary file modified graphrag/query/__pycache__/cli.cpython-310.pyc
Binary file not shown.
51 changes: 43 additions & 8 deletions graphrag/query/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
"""Command line interface for the query module."""

import os
import glob
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from pathlib import Path
from typing import cast

Expand Down Expand Up @@ -59,19 +66,47 @@ def run_global_search(
community_level: int,
response_type: str,
query: str,
config: GraphRagConfig,
):
"""Run a global search with the given query."""
data_dir, root_dir, config = _configure_paths_and_settings(data_dir, root_dir)
data_path = Path(data_dir)
logger.info(f"Starting global search with query: {query}")

root_dir = config.root_dir
if not root_dir:
raise ValueError("Root directory is not defined in the configuration")

output_dir = os.path.join(root_dir, "output")
logger.info(f"Output directory: {output_dir}")

if not os.path.exists(output_dir):
raise FileNotFoundError(f"Output directory does not exist: {output_dir}")

# Find the latest run directory
run_dirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d))]
if not run_dirs:
raise FileNotFoundError(f"No run directories found in {output_dir}")

latest_run = max(run_dirs) # Assumes directory names are sortable (e.g., timestamps)
logger.info(f"Latest run directory: {latest_run}")

parquet_path = os.path.join(output_dir, latest_run, "artifacts", "create_final_nodes.parquet")
logger.info(f"Looking for parquet file: {parquet_path}")

if not os.path.exists(parquet_path):
raise FileNotFoundError(f"Parquet file not found: {parquet_path}")

try:
final_nodes: pd.DataFrame = pd.read_parquet(parquet_path)
logger.info(f"Successfully read parquet file with {len(final_nodes)} rows")
except Exception as e:
logger.error(f"Error reading parquet file {parquet_path}: {str(e)}")
raise IOError(f"Error reading parquet file {parquet_path}: {str(e)}")

final_nodes: pd.DataFrame = pd.read_parquet(
data_path / "create_final_nodes.parquet"
)
final_entities: pd.DataFrame = pd.read_parquet(
data_path / "create_final_entities.parquet"
data_dir / "create_final_entities.parquet"
)
final_community_reports: pd.DataFrame = pd.read_parquet(
data_path / "create_final_community_reports.parquet"
data_dir / "create_final_community_reports.parquet"
)

reports = read_indexer_reports(
Expand Down Expand Up @@ -209,4 +244,4 @@ def _read_config_parameters(root: str):
return create_graphrag_config(data, root)

reporter.info("Reading settings from environment variables")
return create_graphrag_config(root_dir=root)
return create_graphrag_config(root_dir=root)
Binary file modified lancedb/.DS_Store
Binary file not shown.
Binary file modified lancedb/description_embedding.lance/.DS_Store
Binary file not shown.
Binary file modified lancedb/description_embedding.lance/_latest.manifest
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

$3f2722e9-ac06-467a-b742-3be376b857e2��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
 $172bd179-ce47-424f-8d9d-8a86eb9f2bad��id ���������*string08text ���������*string084vector ���������*fixed_size_list:float:76808%
attributes ���������*string08
Binary file not shown.
Loading

0 comments on commit 4484c3e

Please sign in to comment.