Add skip_noisy_assignment to dataset.cluster #1194

Merged (3 commits, Feb 28, 2024)
lilac/data/clustering.py (22 additions, 10 deletions)

@@ -53,7 +53,7 @@
 UMAP_DIM = 5
 UMAP_SEED = 42
 HDBSCAN_SELECTION_EPS = 0.05
-BATCH_SOFT_CLUSTER_NOISE = 1024
+BATCH_SOFT_CLUSTER_NOISE = 512


 def cluster_impl(
@@ -68,6 +68,7 @@ def cluster_impl(
   task_id: Optional[TaskId] = None,
   recompute_titles: bool = False,
   batch_size_titling: Optional[int] = None,
+  skip_noisy_assignment: bool = False,
 ) -> None:
   """Compute clusters for a field of the dataset."""
   topic_fn = topic_fn or generate_title_openai
@@ -154,7 +155,12 @@ def cluster_documents(items: Iterator[Item]) -> Iterator[Item]:
     cluster_items = sparse_to_dense_compute(
       docs,
       lambda x: _hdbscan_cluster(
-        x, min_cluster_size, use_garden, num_docs=total_len, task_info=task_info
+        x,
+        min_cluster_size,
+        use_garden,
+        num_docs=total_len,
+        task_info=task_info,
+        skip_noisy_assignment=skip_noisy_assignment,
       ),
     )
     for item, cluster_item in zip(items2, cluster_items):
@@ -208,7 +214,13 @@ def cluster_titles(items: Iterator[Item]) -> Iterator[Item]:
     items, items2 = itertools.tee(items)
     docs = (item.get(CLUSTER_TITLE) for item in items)
     cluster_items = sparse_to_dense_compute(
-      docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden)
+      docs,
+      lambda x: _hdbscan_cluster(
+        x,
+        MIN_CLUSTER_SIZE_CATEGORY,
+        use_garden=use_garden,
+        skip_noisy_assignment=skip_noisy_assignment,
+      ),
     )
     for item, cluster_item in zip(items2, cluster_items):
       item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1)
@@ -298,6 +310,7 @@ def _hdbscan_cluster(
   use_garden: bool = False,
   num_docs: Optional[int] = None,
   task_info: Optional[TaskInfo] = None,
+  skip_noisy_assignment: bool = False,
 ) -> Iterator[Item]:
   """Cluster docs with HDBSCAN."""
   if use_garden:
@@ -338,9 +351,9 @@
   from umap import UMAP

   dim = all_vectors[0].size
-  with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
-    n_neighbors = min(30, len(all_vectors) - 1)
-    if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+  n_neighbors = min(30, len(all_vectors) - 1)
+  if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+    with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
       reducer = UMAP(
         n_components=UMAP_DIM,
         n_neighbors=n_neighbors,
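The reshuffle above moves the size check outside the timer, so UMAP (and its log line) is skipped entirely when the vectors are already low-dimensional or too few. A standalone sketch of that guard under the same constants; the function name is illustrative, not part of the diff:

import numpy as np
from umap import UMAP

UMAP_DIM = 5
UMAP_SEED = 42

def maybe_reduce(all_vectors: list[np.ndarray]) -> np.ndarray:
  """Reduce to UMAP_DIM dims, but only when a reduction is meaningful."""
  vectors = np.array(all_vectors, dtype=np.float32)
  dim = vectors.shape[1]
  # UMAP needs n_neighbors < n_samples, and reducing is pointless unless the
  # target dim is smaller than both the input dim and the number of points.
  if UMAP_DIM < dim and UMAP_DIM < len(vectors):
    n_neighbors = min(30, len(vectors) - 1)
    reducer = UMAP(n_components=UMAP_DIM, n_neighbors=n_neighbors, random_state=UMAP_SEED)
    vectors = reducer.fit_transform(vectors)
  return vectors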
@@ -375,14 +388,13 @@ def _hdbscan_cluster(
     if cluster_id == -1:
       noisy_vectors.append(all_vectors[i])
   num_noisy = len(noisy_vectors)
-  perc_noisy = 100 * num_noisy / len(clusterer.labels_)
-  log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')

   noisy_labels: list[np.ndarray] = []
   noisy_probs: list[np.ndarray] = []
   labels = clusterer.labels_
   memberships = clusterer.probabilities_
-  if num_noisy > 0 and num_noisy < len(clusterer.labels_):
+  if not skip_noisy_assignment and num_noisy > 0 and num_noisy < len(clusterer.labels_):
+    perc_noisy = 100 * num_noisy / len(clusterer.labels_)
+    log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')
     with DebugTimer('HDBSCAN: Computing membership for the noise points'):
       for batch_noisy_vectors in chunks(noisy_vectors, BATCH_SOFT_CLUSTER_NOISE):
         batch_noisy_vectors = np.array(batch_noisy_vectors, dtype=np.float32)
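For context: the branch this flag now guards does HDBSCAN soft clustering, batching the noise points (label -1) and assigning each to the cluster with the highest membership probability; skip_noisy_assignment=True bypasses all of it, leaving those points at cluster_id -1 with probability 0.0, which is what the new test below asserts for 'Hello world'. A minimal sketch of the soft-assignment pattern, assuming a clusterer fitted with prediction_data=True (the helper name is illustrative, not the exact Lilac code):

import hdbscan
import numpy as np

BATCH_SOFT_CLUSTER_NOISE = 512

def soft_assign_noise(
  clusterer: hdbscan.HDBSCAN, noisy_vectors: list[np.ndarray]
) -> tuple[np.ndarray, np.ndarray]:
  """Return (cluster label, membership prob) for each noise point."""
  labels: list[np.ndarray] = []
  probs: list[np.ndarray] = []
  for start in range(0, len(noisy_vectors), BATCH_SOFT_CLUSTER_NOISE):
    batch = np.array(noisy_vectors[start : start + BATCH_SOFT_CLUSTER_NOISE], dtype=np.float32)
    # membership_vector gives each point a probability per discovered cluster.
    memberships = hdbscan.membership_vector(clusterer, batch)
    labels.append(memberships.argmax(axis=1))
    probs.append(memberships.max(axis=1))
  if not labels:
    return np.array([]), np.array([])
  return np.concatenate(labels), np.concatenate(probs)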
lilac/data/clustering_test.py (92 additions, 2 deletions)

@@ -62,9 +62,9 @@ def compute(docs: list[str]) -> list[Item]:
       if 'summar' in doc or 'hello' in doc or 'greeting' in doc:
         result.append([chunk_embedding(0, len(doc), np.array([1, 1, 1]))])
       elif 'simpl' in doc or 'whats' in doc or 'time' in doc:
-        result.append([chunk_embedding(0, len(doc), np.array([0, 0, 0]))])
+        result.append([chunk_embedding(0, len(doc), np.array([-1, -1, -1]))])
       else:
-        result.append([chunk_embedding(0, len(doc), np.array([0.5, 0.5, 0.5]))])
+        result.append([chunk_embedding(0, len(doc), np.array([100, 0, -100]))])
     return result

   mocker.patch.object(JinaV2Small, 'compute', side_effect=compute)
@@ -718,3 +718,93 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
       },
     },
   ]
+
+
+def test_clusters_skip_noisy_assignment(
+  make_test_data: TestDataMaker, mocker: MockerFixture
+) -> None:
+  texts: list[str] = [
+    'Can you summarize this article',
+    'Can you rewrite this in a simpler way',
+    'Can you provide a short summary of the following text',
+    'Can you simplify this text',
+    'Hello world',
+  ]
+  dataset = make_test_data([{'text': t} for t in texts])
+
+  def topic_fn(docs: list[tuple[str, float]]) -> str:
+    if 'summar' in docs[0][0]:
+      return 'summarization'
+    elif 'simpl' in docs[0][0]:
+      return 'simplification'
+    return 'other'
+
+  mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2)
+  _mock_jina(mocker)
+
+  dataset.cluster(
+    'text',
+    min_cluster_size=2,
+    topic_fn=topic_fn,
+    category_fn=lambda _: 'MockCategory',
+    skip_noisy_assignment=True,
+  )
+
+  rows = list(dataset.select_rows(['text', 'text__cluster'], combine_columns=True))
+  assert rows == [
+    {
+      'text': 'Can you summarize this article',
+      'text__cluster': {
+        'cluster_id': 0,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'summarization',
+        'category_id': 0,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you rewrite this in a simpler way',
+      'text__cluster': {
+        'cluster_id': 1,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'simplification',
+        'category_id': 1,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you provide a short summary of the following text',
+      'text__cluster': {
+        'cluster_id': 0,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'summarization',
+        'category_id': 0,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you simplify this text',
+      'text__cluster': {
+        'cluster_id': 1,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'simplification',
+        'category_id': 1,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Hello world',
+      'text__cluster': {
+        'cluster_id': -1,
+        'cluster_membership_prob': 0.0,
+        'cluster_title': None,
+        'category_id': -1,
+        'category_membership_prob': 0.0,
+        'category_title': None,
+      },
+    },
+  ]
lilac/data/dataset.py (4 additions, 0 deletions)

@@ -506,6 +506,7 @@ def cluster(
     task_id: Optional[TaskId] = None,
     # TODO(0.4.0): colocate with topic_fn.
     category_fn: Optional[TopicFn] = None,
+    skip_noisy_assignment: bool = False,
   ) -> None:
     """Compute clusters for a field of the dataset.

@@ -524,6 +525,9 @@
         of the task.
       category_fn: A function that returns a category for a set of related titles. It takes a list
         of (doc, membership_score) tuples and returns a single category name.
+      skip_noisy_assignment: If true, noisy points will not be assigned to the nearest cluster.
+        This only has an effect when the clustering is done locally (use_garden=False) and will
+        speed up clustering.

     """
     pass
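From the public API, opting in looks like the call in the test above. A usage sketch (the namespace and dataset name here are hypothetical):

import lilac as ll

dataset = ll.get_dataset('local', 'my_dataset')  # hypothetical namespace/name
dataset.cluster(
  'text',
  min_cluster_size=2,
  skip_noisy_assignment=True,  # noise points keep cluster_id -1 instead of the nearest cluster
)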
lilac/data/dataset_duckdb.py (3 additions, 0 deletions)

@@ -3334,6 +3334,7 @@ def cluster(
     use_garden: bool = False,
     task_id: Optional[TaskId] = None,
     category_fn: Optional[TopicFn] = cluster_titling.generate_category_openai,
+    skip_noisy_assignment: bool = False,
   ) -> None:
     topic_fn = topic_fn or cluster_titling.generate_title_openai
     category_fn = category_fn or cluster_titling.generate_category_openai
@@ -3347,6 +3348,7 @@
       overwrite=overwrite,
       use_garden=use_garden,
       task_id=task_id,
+      skip_noisy_assignment=skip_noisy_assignment,
     )

   @override
@@ -3950,6 +3952,7 @@ def _auto_bins(stats: StatsResult) -> list[Bin]:
     return [('0', const_val, None)]

   is_integer = stats.value_samples and all(isinstance(val, int) for val in stats.value_samples)
+
   def _round(value: float) -> float:
     # Select a round ndigits as a function of the value range. We offset it by 2 to allow for some
     # decimal places as a function of the range.