
Commit

Updates to chroma count
- Updated ChromaViewMaster to report the record count plus the unique-document count, provided the database has metadata.
- Added a new chroma-test.py, a simple test utility for checking the data store's integrity and collection names. Only meant as a quick test, nothing else; a usage sketch follows below.
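One way to launch the new utility locally, assuming Streamlit's -- separator (which forwards everything after it to the script's own argparse rather than to Streamlit itself); the path is an illustrative placeholder for a real Chroma persistence directory:

    streamlit run chroma-test.py -- /path/to/chroma_db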
clearsitedesigns committed May 28, 2024
1 parent c86efaf commit 93cded0
Showing 4 changed files with 170 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
service-test_knowledge_graph.html
112 changes: 112 additions & 0 deletions chroma-test.py
@@ -0,0 +1,112 @@
import chromadb
import pandas as pd
import streamlit as st
import argparse


# This is just a quick test utility to validate the datastore

# Set up argument parser for command line arguments
parser = argparse.ArgumentParser(description='Set the Chroma DB path')
parser.add_argument('db', help='Path to the Chroma DB directory')

# Function to render collection details
def render_collection_details(collection):
    st.subheader(f"Collection: {collection.name}")

    # Get the collection data
    data = collection.get()
    df = pd.DataFrame.from_dict(data)

    # List all available columns in the collection
    st.write("Available Columns in Collection:")
    st.write(df.columns.tolist())

    # Display the fields in the collection
    st.write("Collection Fields:")
    for field, value in data.items():
        if value is not None:
            st.write(f"{field}: {len(value)}")
        else:
            st.write(f"{field}: None")

    # Display the columns available in each part of the collection
    if 'ids' in df.columns:
        st.write("Columns in 'ids':")
        st.write(pd.DataFrame(df['ids']).columns.tolist())

    if 'embeddings' in df.columns:
        st.write("Columns in 'embeddings':")
        if df['embeddings'] is not None and len(df['embeddings']) > 0:
            st.write(pd.DataFrame(df['embeddings']).columns.tolist())
        else:
            st.write("None")

    if 'metadatas' in df.columns:
        st.write("Columns in 'metadatas':")
        if df['metadatas'] is not None and len(df['metadatas']) > 0:
            st.write(pd.DataFrame(df['metadatas']).columns.tolist())
        else:
            st.write("None")

    if 'documents' in df.columns:
        st.write("Columns in 'documents':")
        if df['documents'] is not None and len(df['documents']) > 0:
            st.write(pd.DataFrame(df['documents']).columns.tolist())
        else:
            st.write("None")

    # Get the unique document count based on the 'url' field in metadata
    if 'metadatas' in df.columns:
        # Extract URLs from metadata (guard against None metadata entries)
        urls = [meta.get('url') for meta in df['metadatas'] if meta and meta.get('url') is not None]
        unique_urls = pd.Series(urls).unique()
        num_documents = len(unique_urls)
        st.write(f"Number of Documents (unique URLs): {num_documents}")

        # Display the unique URLs for debugging purposes
        st.write("Unique URLs:")
        st.write(unique_urls)

        # Display the first few documents associated with unique URLs for verification
        st.write("Sample documents associated with unique URLs:")
        sample_docs = df[df['metadatas'].apply(lambda x: bool(x) and x.get('url') in unique_urls)].head(10)
        st.write(sample_docs)
    else:
        st.write("The collection does not contain a 'metadatas' column.")

    # Display the detailed data for each field
    st.write("Detailed Collection Data:")
    for field, value in data.items():
        st.write(f"{field}:")
        if isinstance(value, list):
            if len(value) > 0 and isinstance(value[0], dict):
                df_field = pd.DataFrame(value)
                st.write(df_field)
            else:
                st.write(value)
        else:
            st.write(value)

# Main function to handle the Streamlit app
def main():
    st.title("Chroma Collection Details")

    # Launch with `streamlit run chroma-test.py -- <db_path>` so the path
    # after `--` reaches argparse rather than Streamlit itself
    args = parser.parse_args()
    db_path = args.db

    st.write(f"DB Path: {db_path}")

    client = chromadb.PersistentClient(path=db_path)

    collections = client.list_collections()
    collection_names = [collection.name for collection in collections]

    selected_collection = st.selectbox("Select a collection", collection_names)

    if selected_collection:
        collection = client.get_collection(name=selected_collection)
        render_collection_details(collection)

if __name__ == "__main__":
    main()
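For reference, the unique-document count that chroma-test.py (and the chromaViewMaster.py change below) computes boils down to deduplicating the 'url' key across metadata dicts. A minimal standalone sketch, with hypothetical sample data in place of a real collection:

import pandas as pd

# Hypothetical metadata rows, shaped like collection.get()["metadatas"]
metadatas = [
    {"url": "https://example.com/a", "chunk": 1},
    {"url": "https://example.com/a", "chunk": 2},  # same source document, second chunk
    {"url": "https://example.com/b", "chunk": 1},
    None,  # records without metadata are skipped by the guard below
]

urls = [m.get("url") for m in metadatas if m and m.get("url") is not None]
unique_urls = pd.Series(urls).unique()
print(f"Number of Documents (unique URLs): {len(unique_urls)}")  # -> 2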
70 changes: 55 additions & 15 deletions chromaViewMaster.py
@@ -92,23 +92,49 @@ def preprocess_for_knowledge_graph(df):
     entity_freq = Counter(entities)
     return entities, relationships, entity_freq
 
-def render_collection_statistics(df):
+def display_unique_document_count(df):
+    if 'metadatas' in df.columns:
+        # Extract URLs from metadata
+        urls = [meta.get('url') for meta in df['metadatas'] if meta.get('url') is not None]
+        unique_urls = pd.Series(urls).unique()
+        num_documents = len(unique_urls)
+        st.write(f"Number of Documents (unique URLs): {num_documents}")
+
+        # Display the unique URLs for debugging purposes
+        st.write("Unique URLs:")
+        st.write(unique_urls)
+
+        # Display the first few documents associated with unique URLs for verification
+        st.write("Sample documents associated with unique URLs:")
+        sample_docs = df[df['metadatas'].apply(lambda x: x.get('url') in unique_urls)].head(10)
+        st.write(sample_docs)
+    else:
+        st.write("The collection does not contain a 'metadatas' column.")
+
+def render_collection_statistics(collection):
     st.subheader("Collection Statistics")
     st.write("View statistics on the current collection.")
 
-    num_documents = len(df)
+    # Get the collection data
+    data = collection.get()
+    df = pd.DataFrame.from_dict(data)
+
+    # Get the unique document count based on the 'url' field in metadata
+    display_unique_document_count(df)
+
+    # Calculate other statistics
     doc_lengths = df['documents'].apply(len)
     avg_doc_length = doc_lengths.mean()
     median_doc_length = doc_lengths.median()
     std_doc_length = doc_lengths.std()
 
     most_common_words = Counter(" ".join(df['documents']).split()).most_common(10)
     most_common_words_str = ", ".join([word for word, count in most_common_words])
 
     entities = [ent.text for doc in nlp.pipe(df['documents'].astype(str)) for ent in doc.ents]
     most_common_entities = Counter(entities).most_common(10)
     most_common_entities_str = ", ".join([entity for entity, count in most_common_entities])
 
     df['sentiment'] = df['documents'].apply(lambda x: TextBlob(x).sentiment.polarity)
     sentiment_counts = df['sentiment'].value_counts(bins=3, sort=False)
     sentiment_distribution = {
@@ -117,8 +143,9 @@ def render_collection_statistics(df):
         'Positive': sentiment_counts.iloc[2]
     }
 
+    # Create a dictionary with the statistics
     statistics = {
-        "Number of Documents": num_documents,
+        "Number of Documents": len(df['documents']),
         "Average Document Length": avg_doc_length,
         "Median Document Length": median_doc_length,
         "Standard Deviation of Document Length": std_doc_length,
@@ -128,10 +155,11 @@ def render_collection_statistics(df):
         "Neutral Sentiment Documents": sentiment_distribution['Neutral'],
         "Positive Sentiment Documents": sentiment_distribution['Positive']
     }
 
+    # Convert all values to strings to avoid conversion errors
     statistics_str = {k: str(v) for k, v in statistics.items()}
 
-
+    # Display the statistics as a table
     st.table(pd.DataFrame(statistics_str.items(), columns=['Statistic', 'Value']))
 
 # Visualization Functions
@@ -664,7 +692,6 @@ def render_sentiment_analysis(df):
     # Sentiment vs. Document Length
     fig_sent_len = px.scatter(df, x='document_length', y='sentiment', title="Sentiment vs. Document Length")
     st.plotly_chart(fig_sent_len)
-
 
 def render_network_centrality(G, collection_name):
     centrality = nx.degree_centrality(G)
@@ -675,18 +702,31 @@ def render_network_centrality(G, collection_name):
 def view_collections(dir):
     st.header("ChromaView Master 1.1")
     st.markdown("DB Path: %s" % dir)
-    st.write("Below are the collections found in the specified Chroma DB path. You can explore the collections by performing various actions like topic modeling, similarity search, and knowledge graph visualization. Select a chroma collection below. All tools were massively upgrade. Some may take longer to load.")
+    st.write("Below are the collections found in the specified Chroma DB path. You can explore the collections by performing various actions like topic modeling, similarity search, and knowledge graph visualization. Select a chroma collection below. All tools were massively upgraded. Some may take longer to load.")
 
+    version = "1.1"  # Replace with your actual versioning logic
+
+    st.markdown(f"""
+**ChromaView Master {version} empowers data scientists and analysts to gain a comprehensive understanding of their Chroma DB collections.**
+Leveraging techniques like Latent Dirichlet Allocation (LDA) for topic modeling and spaCy for entity extraction, the tool provides:
+* **Advanced Analysis:** Go beyond basic statistics with in-depth visualizations.
+* **Customizable Exploration:** Choose the analysis methods that best suit your needs.
+* **Interactive Interface:** Streamlit-based for easy navigation and exploration.
+* **Open Source:** Built with flexibility and extensibility in mind.
+""")
+
     client = chromadb.PersistentClient(path=dir)
 
     collections = client.list_collections()
     collection_names = [collection.name for collection in collections]
     selected_collection_name = st.selectbox("**Begin By Selecting a Collection**", collection_names)
 
+
     if selected_collection_name:
         for collection in collections:
             if collection.name == selected_collection_name:
-                data = collection.get()
+                current_collection = collection  # Store the actual collection object
+                data = current_collection.get()
                 ids = data['ids']
                 embeddings = data["embeddings"]
                 metadata = data["metadatas"]
@@ -714,7 +754,7 @@ def view_collections(dir):
                 st.write("Your output from the left selection will appear here.")
 
                 if view_collection_statistics:
-                    render_collection_statistics(df)
+                    render_collection_statistics(collection)
 
                 if perform_topic_modeling:
                     render_topic_modeling(df, collection.name)
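Note that the refactored render_collection_statistics now surfaces two different counts: the raw record total (the diff computes it as len(df['documents']), i.e. every stored chunk) and the unique-document count derived from metadata URLs. A minimal sketch of that distinction against chromadb's public client API, with an illustrative path and a hypothetical collection name; collection.count() returns the same record total without fetching the data:

import chromadb

client = chromadb.PersistentClient(path="path/to/chroma_db")  # illustrative path
collection = client.get_collection(name="my_collection")  # hypothetical collection name

total_records = collection.count()  # every stored record (chunk-level)
data = collection.get(include=["metadatas"])
urls = {m.get("url") for m in data["metadatas"] if m and m.get("url")}

print(f"Records in collection: {total_records}")
print(f"Unique source documents (by metadata 'url'): {len(urls)}")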
2 changes: 2 additions & 0 deletions config.toml
@@ -7,3 +7,5 @@ baseUrlPath = "/static"
maxMessageSize = 600 # Set to 500 MB (or adjust as needed)


[server]
fileWatcherType = "poll"
