Skip to content

Commit

Permalink
fix: skip over HDBSCAN for small cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
danellecline committed Jul 31, 2024
1 parent 59e160f commit cbf1ea3
Showing 1 changed file with 18 additions and 15 deletions.
33 changes: 18 additions & 15 deletions sdcat/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,29 @@ def _run_hdbscan_assign(
x = MinMaxScaler().fit_transform(embedding) # scale the embedding to 0-1

# Cluster the embeddings using HDBSCAN
if have_gpu:
scan = cuHDBSCAN(
metric='euclidean', # 'precomputed' does not work with cuHDBSCAN
allow_single_cluster=True,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
alpha=alpha,
cluster_selection_epsilon=cluster_selection_epsilon,
cluster_selection_method=cluster_selection_method).fit_predict(x)
labels = scan.fit_predict(x)
if len(df) == 1:
labels = np.array([0])
else:
scan = HDBSCAN(
metric='l2',
if have_gpu:
scan = cuHDBSCAN(
metric='euclidean', # 'precomputed' does not work with cuHDBSCAN
allow_single_cluster=True,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
alpha=alpha,
cluster_selection_epsilon=cluster_selection_epsilon,
cluster_selection_method=cluster_selection_method)
labels = scan.fit_predict(x)
cluster_selection_method=cluster_selection_method).fit_predict(x)
labels = scan.fit_predict(x)
else:
scan = HDBSCAN(
metric='l2',
allow_single_cluster=True,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
alpha=alpha,
cluster_selection_epsilon=cluster_selection_epsilon,
cluster_selection_method=cluster_selection_method)
labels = scan.fit_predict(x)

# Get the unique cluster labels and sort them; a label of -1 marks unassigned samples
cluster_df = pd.DataFrame(labels, columns=['cluster'])
Expand All @@ -126,7 +129,7 @@ def _run_hdbscan_assign(

# If every sample is unassigned (the only label is -1), use all the samples as exemplars
# and assign them to the 'Unknown' cluster. The same applies when the embedding is empty (embedding extraction failed).
if len(unique_clusters) == 1 and unique_clusters[0] == -1:
if len(unique_clusters) == 1 and unique_clusters[0] == -1 or len(x) == 1:
avg_sim_scores = []
exemplar_df = pd.DataFrame()
exemplar_df['cluster'] = len(x) * ['Unknown']
Expand Down

0 comments on commit cbf1ea3

Please sign in to comment.