Add skip_noisy_assignment to dataset.cluster (#1194)
When `skip_noisy_assignment=True`, we skip an expensive step that assigns noisy
points to their nearest clusters. It only has an effect when running locally.
dsmilkov authored Feb 28, 2024
1 parent 8965e98 commit f74d88d
Showing 7 changed files with 155 additions and 13 deletions.
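
Before the per-file diffs, a minimal usage sketch of the new flag, hedged: `dataset` stands in for an existing Lilac dataset object and `'text'` for the column being clustered; only `skip_noisy_assignment` is introduced by this commit.

# Illustrative sketch (not from this commit): call the new flag on a locally-run cluster.
dataset.cluster(
  'text',                      # path of the field to cluster
  min_cluster_size=5,
  use_garden=False,            # the flag only applies locally; combining it with
                               # use_garden=True raises a ValueError (see clustering.py below)
  skip_noisy_assignment=True,  # skip soft-assignment of HDBSCAN noise points
)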
39 changes: 29 additions & 10 deletions lilac/data/clustering.py
@@ -53,7 +53,7 @@
UMAP_DIM = 5
UMAP_SEED = 42
HDBSCAN_SELECTION_EPS = 0.05
-BATCH_SOFT_CLUSTER_NOISE = 1024
+BATCH_SOFT_CLUSTER_NOISE = 512


def cluster_impl(
@@ -68,6 +68,7 @@ def cluster_impl(
task_id: Optional[TaskId] = None,
recompute_titles: bool = False,
batch_size_titling: Optional[int] = None,
skip_noisy_assignment: bool = False,
) -> None:
"""Compute clusters for a field of the dataset."""
topic_fn = topic_fn or generate_title_openai
@@ -108,6 +109,13 @@ def cluster_impl(
else:
raise ValueError('input must be provided.')

if use_garden and skip_noisy_assignment:
raise ValueError(
'`use_garden` and `skip_noisy_assignment` cannot both be True. '
'The garden implementation is heavily optimized and will always '
'assign noisy points to the nearest cluster.'
)

# Extract the text from the input path into a temporary column.
TEXT_COLUMN = 'text'
temp_text_path = (*cluster_output_path, TEXT_COLUMN)
@@ -154,7 +162,12 @@ def cluster_documents(items: Iterator[Item]) -> Iterator[Item]:
cluster_items = sparse_to_dense_compute(
docs,
lambda x: _hdbscan_cluster(
-x, min_cluster_size, use_garden, num_docs=total_len, task_info=task_info
+x,
+min_cluster_size,
+use_garden,
+num_docs=total_len,
+task_info=task_info,
+skip_noisy_assignment=skip_noisy_assignment,
),
)
for item, cluster_item in zip(items2, cluster_items):
@@ -208,7 +221,13 @@ def cluster_titles(items: Iterator[Item]) -> Iterator[Item]:
items, items2 = itertools.tee(items)
docs = (item.get(CLUSTER_TITLE) for item in items)
cluster_items = sparse_to_dense_compute(
-docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden)
+docs,
+lambda x: _hdbscan_cluster(
+x,
+MIN_CLUSTER_SIZE_CATEGORY,
+use_garden=use_garden,
+skip_noisy_assignment=skip_noisy_assignment,
+),
)
for item, cluster_item in zip(items2, cluster_items):
item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1)
@@ -298,6 +317,7 @@ def _hdbscan_cluster(
use_garden: bool = False,
num_docs: Optional[int] = None,
task_info: Optional[TaskInfo] = None,
skip_noisy_assignment: bool = False,
) -> Iterator[Item]:
"""Cluster docs with HDBSCAN."""
if use_garden:
@@ -338,9 +358,9 @@
from umap import UMAP

dim = all_vectors[0].size
-with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
-n_neighbors = min(30, len(all_vectors) - 1)
-if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+n_neighbors = min(30, len(all_vectors) - 1)
+if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
reducer = UMAP(
n_components=UMAP_DIM,
n_neighbors=n_neighbors,
@@ -375,14 +395,13 @@
if cluster_id == -1:
noisy_vectors.append(all_vectors[i])
num_noisy = len(noisy_vectors)
-perc_noisy = 100 * num_noisy / len(clusterer.labels_)
-log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')

noisy_labels: list[np.ndarray] = []
noisy_probs: list[np.ndarray] = []
labels = clusterer.labels_
memberships = clusterer.probabilities_
-if num_noisy > 0 and num_noisy < len(clusterer.labels_):
+if not skip_noisy_assignment and num_noisy > 0 and num_noisy < len(clusterer.labels_):
+perc_noisy = 100 * num_noisy / len(clusterer.labels_)
+log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')
with DebugTimer('HDBSCAN: Computing membership for the noise points'):
for batch_noisy_vectors in chunks(noisy_vectors, BATCH_SOFT_CLUSTER_NOISE):
batch_noisy_vectors = np.array(batch_noisy_vectors, dtype=np.float32)
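
For context on what gets skipped: a minimal sketch of batched soft-assignment of HDBSCAN noise points, assuming the `hdbscan` package and a clusterer fit with `prediction_data=True`. The helper name and batch size are illustrative, not Lilac's exact implementation.

import numpy as np
import hdbscan


def soft_assign_noise(
  clusterer: hdbscan.HDBSCAN,
  noisy_vectors: list[np.ndarray],
  batch_size: int = 512,
) -> tuple[np.ndarray, np.ndarray]:
  """Assign noise points (label -1) to their most probable cluster, in batches."""
  if not noisy_vectors:
    return np.array([]), np.array([])
  labels: list[np.ndarray] = []
  probs: list[np.ndarray] = []
  for start in range(0, len(noisy_vectors), batch_size):
    batch = np.array(noisy_vectors[start:start + batch_size], dtype=np.float32)
    # membership_vector returns, for each point, a probability per discovered cluster.
    memberships = hdbscan.membership_vector(clusterer, batch)
    labels.append(np.argmax(memberships, axis=1))
    probs.append(np.max(memberships, axis=1))
  return np.concatenate(labels), np.concatenate(probs)

This per-batch pass over every noise point is the expensive step that `skip_noisy_assignment=True` bypasses; skipped points simply keep cluster_id -1, as the new test below demonstrates.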
94 changes: 92 additions & 2 deletions lilac/data/clustering_test.py
@@ -62,9 +62,9 @@ def compute(docs: list[str]) -> list[Item]:
if 'summar' in doc or 'hello' in doc or 'greeting' in doc:
result.append([chunk_embedding(0, len(doc), np.array([1, 1, 1]))])
elif 'simpl' in doc or 'whats' in doc or 'time' in doc:
-result.append([chunk_embedding(0, len(doc), np.array([0, 0, 0]))])
+result.append([chunk_embedding(0, len(doc), np.array([-1, -1, -1]))])
else:
-result.append([chunk_embedding(0, len(doc), np.array([0.5, 0.5, 0.5]))])
+result.append([chunk_embedding(0, len(doc), np.array([100, 0, -100]))])
return result

mocker.patch.object(JinaV2Small, 'compute', side_effect=compute)
@@ -718,3 +718,93 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
},
},
]


def test_clusters_skip_noisy_assignment(
make_test_data: TestDataMaker, mocker: MockerFixture
) -> None:
texts: list[str] = [
'Can you summarize this article',
'Can you rewrite this in a simpler way',
'Can you provide a short summary of the following text',
'Can you simplify this text',
'Hello world',
]
dataset = make_test_data([{'text': t} for t in texts])

def topic_fn(docs: list[tuple[str, float]]) -> str:
if 'summar' in docs[0][0]:
return 'summarization'
elif 'simpl' in docs[0][0]:
return 'simplification'
return 'other'

mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2)
_mock_jina(mocker)

dataset.cluster(
'text',
min_cluster_size=2,
topic_fn=topic_fn,
category_fn=lambda _: 'MockCategory',
skip_noisy_assignment=True,
)

rows = list(dataset.select_rows(['text', 'text__cluster'], combine_columns=True))
assert rows == [
{
'text': 'Can you summarize this article',
'text__cluster': {
'cluster_id': 0,
'cluster_membership_prob': 1.0,
'cluster_title': 'summarization',
'category_id': 0,
'category_membership_prob': 1.0,
'category_title': 'MockCategory',
},
},
{
'text': 'Can you rewrite this in a simpler way',
'text__cluster': {
'cluster_id': 1,
'cluster_membership_prob': 1.0,
'cluster_title': 'simplification',
'category_id': 1,
'category_membership_prob': 1.0,
'category_title': 'MockCategory',
},
},
{
'text': 'Can you provide a short summary of the following text',
'text__cluster': {
'cluster_id': 0,
'cluster_membership_prob': 1.0,
'cluster_title': 'summarization',
'category_id': 0,
'category_membership_prob': 1.0,
'category_title': 'MockCategory',
},
},
{
'text': 'Can you simplify this text',
'text__cluster': {
'cluster_id': 1,
'cluster_membership_prob': 1.0,
'cluster_title': 'simplification',
'category_id': 1,
'category_membership_prob': 1.0,
'category_title': 'MockCategory',
},
},
{
'text': 'Hello world',
'text__cluster': {
'cluster_id': -1,
'cluster_membership_prob': 0.0,
'cluster_title': None,
'category_id': -1,
'category_membership_prob': 0.0,
'category_title': None,
},
},
]
4 changes: 4 additions & 0 deletions lilac/data/dataset.py
@@ -506,6 +506,7 @@ def cluster(
task_id: Optional[TaskId] = None,
# TODO(0.4.0): colocate with topic_fn.
category_fn: Optional[TopicFn] = None,
skip_noisy_assignment: bool = False,
) -> None:
"""Compute clusters for a field of the dataset.
@@ -524,6 +525,9 @@
of the task.
category_fn: A function that returns a category for a set of related titles. It takes a list
of (doc, membership_score) tuples and returns a single category name.
skip_noisy_assignment: If true, noisy points will not be assigned to the nearest cluster.
This only has an effect when the clustering is done locally (use_garden=False) and will
speed up clustering.
"""
pass
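
As a hedged illustration of the constraint documented above (the guard lives in `cluster_impl` in clustering.py; `dataset` is a placeholder object):

# The two flags are mutually exclusive; skipping noisy assignment only applies locally.
try:
  dataset.cluster('text', use_garden=True, skip_noisy_assignment=True)
except ValueError as err:
  print(err)  # '`use_garden` and `skip_noisy_assignment` cannot both be True. ...'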
3 changes: 3 additions & 0 deletions lilac/data/dataset_duckdb.py
@@ -3334,6 +3334,7 @@ def cluster(
use_garden: bool = False,
task_id: Optional[TaskId] = None,
category_fn: Optional[TopicFn] = cluster_titling.generate_category_openai,
skip_noisy_assignment: bool = False,
) -> None:
topic_fn = topic_fn or cluster_titling.generate_title_openai
category_fn = category_fn or cluster_titling.generate_category_openai
@@ -3347,6 +3348,7 @@
overwrite=overwrite,
use_garden=use_garden,
task_id=task_id,
skip_noisy_assignment=skip_noisy_assignment,
)

@override
@@ -3950,6 +3952,7 @@ def _auto_bins(stats: StatsResult) -> list[Bin]:
return [('0', const_val, None)]

is_integer = stats.value_samples and all(isinstance(val, int) for val in stats.value_samples)

def _round(value: float) -> float:
# Select a round ndigits as a function of the value range. We offset it by 2 to allow for some
# decimal places as a function of the range.
6 changes: 6 additions & 0 deletions lilac/router_dataset_signals.py
@@ -91,6 +91,11 @@ class ClusterOptions(BaseModel):
use_garden: bool = PydanticField(
default=False, description='Accelerate computation by running remotely on Lilac Garden.'
)
skip_noisy_assignment: bool = PydanticField(
default=False,
description='Skip assignment of noisy points to the nearest cluster to speed up clustering.',
)

overwrite: bool = False


@@ -145,6 +150,7 @@ def run() -> None:
use_garden=options.use_garden,
overwrite=options.overwrite,
task_id=task_id,
skip_noisy_assignment=options.skip_noisy_assignment,
)

launch_task(task_id, run)
18 changes: 17 additions & 1 deletion web/blueprint/src/lib/components/ComputeClusterModal.svelte
@@ -7,6 +7,7 @@
input: Path;
output_path?: Path;
use_garden?: boolean;
skip_noisy_assignment?: boolean;
overwrite?: boolean;
};
@@ -101,7 +102,8 @@
use_garden: options.use_garden,
output_path: outputColumn,
input_selector: selectedFormatSelector,
-overwrite: options.overwrite
+overwrite: options.overwrite,
+skip_noisy_assignment: options.skip_noisy_assignment
}
]);
close();
@@ -173,6 +175,20 @@
</div>
{/if}
</div>

<div>
<div class="label mb-2 font-medium text-gray-700">Skip noisy assignment</div>
<div class="label text-sm text-gray-700">
Skip assignment of noisy points to the nearest cluster to speed up clustering.
</div>
<Toggle
labelA={'False'}
labelB={'True'}
bind:toggled={options.skip_noisy_assignment}
hideLabel
/>
</div>

<div>
<div class="label text-s mb-2 font-medium text-gray-700">Overwrite</div>
<Toggle labelA={'False'} labelB={'True'} bind:toggled={options.overwrite} hideLabel />
4 changes: 4 additions & 0 deletions web/lib/fastapi_client/models/ClusterOptions.ts
@@ -14,6 +14,10 @@ export type ClusterOptions = {
* Accelerate computation by running remotely on Lilac Garden.
*/
use_garden?: boolean;
/**
* Skip assignment of noisy points to the nearest cluster to speed up clustering.
*/
skip_noisy_assignment?: boolean;
overwrite?: boolean;
};
