Add skip_noisy_assignment to dataset.cluster #1194

Merged (3 commits, Feb 28, 2024)
lilac/data/clustering.py (22 additions, 10 deletions)

@@ -53,7 +53,7 @@
 UMAP_DIM = 5
 UMAP_SEED = 42
 HDBSCAN_SELECTION_EPS = 0.05
-BATCH_SOFT_CLUSTER_NOISE = 1024
+BATCH_SOFT_CLUSTER_NOISE = 512


 def cluster_impl(
@@ -68,6 +68,7 @@ def cluster_impl(
   task_id: Optional[TaskId] = None,
   recompute_titles: bool = False,
   batch_size_titling: Optional[int] = None,
+  skip_noisy_assignment: bool = False,
 ) -> None:
   """Compute clusters for a field of the dataset."""
   topic_fn = topic_fn or generate_title_openai
@@ -154,7 +155,12 @@ def cluster_documents(items: Iterator[Item]) -> Iterator[Item]:
     cluster_items = sparse_to_dense_compute(
       docs,
       lambda x: _hdbscan_cluster(
-        x, min_cluster_size, use_garden, num_docs=total_len, task_info=task_info
+        x,
+        min_cluster_size,
+        use_garden,
+        num_docs=total_len,
+        task_info=task_info,
+        skip_noisy_assignment=skip_noisy_assignment,
       ),
     )
     for item, cluster_item in zip(items2, cluster_items):
@@ -208,7 +214,13 @@ def cluster_titles(items: Iterator[Item]) -> Iterator[Item]:
     items, items2 = itertools.tee(items)
     docs = (item.get(CLUSTER_TITLE) for item in items)
     cluster_items = sparse_to_dense_compute(
-      docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden)
+      docs,
+      lambda x: _hdbscan_cluster(
+        x,
+        MIN_CLUSTER_SIZE_CATEGORY,
+        use_garden=use_garden,
+        skip_noisy_assignment=skip_noisy_assignment,
+      ),
     )
     for item, cluster_item in zip(items2, cluster_items):
       item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1)
@@ -298,6 +310,7 @@ def _hdbscan_cluster(
   use_garden: bool = False,
   num_docs: Optional[int] = None,
   task_info: Optional[TaskInfo] = None,
+  skip_noisy_assignment: bool = False,
 ) -> Iterator[Item]:
   """Cluster docs with HDBSCAN."""
   if use_garden:
@@ -338,9 +351,9 @@
   from umap import UMAP

   dim = all_vectors[0].size
-  with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
-    n_neighbors = min(30, len(all_vectors) - 1)
-    if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+  n_neighbors = min(30, len(all_vectors) - 1)
+  if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
+    with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
       reducer = UMAP(
         n_components=UMAP_DIM,
         n_neighbors=n_neighbors,
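The reshuffle above moves the size check outside the timer, so UMAP (and its log line) is skipped entirely when the vectors are already low-dimensional or too few. A standalone sketch of that guard under the same constants; the function name is illustrative, not part of the diff:

import numpy as np
from umap import UMAP

UMAP_DIM = 5
UMAP_SEED = 42

def maybe_reduce(all_vectors: list[np.ndarray]) -> np.ndarray:
  """Reduce to UMAP_DIM dims, but only when a reduction is meaningful."""
  vectors = np.array(all_vectors, dtype=np.float32)
  dim = vectors.shape[1]
  # UMAP needs n_neighbors < n_samples, and reducing is pointless unless the
  # target dim is smaller than both the input dim and the number of points.
  if UMAP_DIM < dim and UMAP_DIM < len(vectors):
    n_neighbors = min(30, len(vectors) - 1)
    reducer = UMAP(n_components=UMAP_DIM, n_neighbors=n_neighbors, random_state=UMAP_SEED)
    vectors = reducer.fit_transform(vectors)
  return vectors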
@@ -375,14 +388,13 @@ def _hdbscan_cluster(
     if cluster_id == -1:
       noisy_vectors.append(all_vectors[i])
   num_noisy = len(noisy_vectors)
-  perc_noisy = 100 * num_noisy / len(clusterer.labels_)
-  log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')

   noisy_labels: list[np.ndarray] = []
   noisy_probs: list[np.ndarray] = []
   labels = clusterer.labels_
   memberships = clusterer.probabilities_
-  if num_noisy > 0 and num_noisy < len(clusterer.labels_):
+  if not skip_noisy_assignment and num_noisy > 0 and num_noisy < len(clusterer.labels_):
+    perc_noisy = 100 * num_noisy / len(clusterer.labels_)
+    log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')
     with DebugTimer('HDBSCAN: Computing membership for the noise points'):
       for batch_noisy_vectors in chunks(noisy_vectors, BATCH_SOFT_CLUSTER_NOISE):
         batch_noisy_vectors = np.array(batch_noisy_vectors, dtype=np.float32)
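For context: the branch this flag now guards does HDBSCAN soft clustering, batching the noise points (label -1) and assigning each to the cluster with the highest membership probability; skip_noisy_assignment=True bypasses all of it, leaving those points at cluster_id -1 with probability 0.0, which is what the new test below asserts for 'Hello world'. A minimal sketch of the soft-assignment pattern, assuming a clusterer fitted with prediction_data=True (the helper name is illustrative, not the exact Lilac code):

import hdbscan
import numpy as np

BATCH_SOFT_CLUSTER_NOISE = 512

def soft_assign_noise(
  clusterer: hdbscan.HDBSCAN, noisy_vectors: list[np.ndarray]
) -> tuple[np.ndarray, np.ndarray]:
  """Return (cluster label, membership prob) for each noise point."""
  labels: list[np.ndarray] = []
  probs: list[np.ndarray] = []
  for start in range(0, len(noisy_vectors), BATCH_SOFT_CLUSTER_NOISE):
    batch = np.array(noisy_vectors[start : start + BATCH_SOFT_CLUSTER_NOISE], dtype=np.float32)
    # membership_vector gives each point a probability per discovered cluster.
    memberships = hdbscan.membership_vector(clusterer, batch)
    labels.append(memberships.argmax(axis=1))
    probs.append(memberships.max(axis=1))
  if not labels:
    return np.array([]), np.array([])
  return np.concatenate(labels), np.concatenate(probs)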
lilac/data/clustering_test.py (92 additions, 2 deletions)

@@ -62,9 +62,9 @@ def compute(docs: list[str]) -> list[Item]:
       if 'summar' in doc or 'hello' in doc or 'greeting' in doc:
         result.append([chunk_embedding(0, len(doc), np.array([1, 1, 1]))])
       elif 'simpl' in doc or 'whats' in doc or 'time' in doc:
-        result.append([chunk_embedding(0, len(doc), np.array([0, 0, 0]))])
+        result.append([chunk_embedding(0, len(doc), np.array([-1, -1, -1]))])
       else:
-        result.append([chunk_embedding(0, len(doc), np.array([0.5, 0.5, 0.5]))])
+        result.append([chunk_embedding(0, len(doc), np.array([100, 0, -100]))])
     return result

   mocker.patch.object(JinaV2Small, 'compute', side_effect=compute)
@@ -718,3 +718,93 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
       },
     },
   ]
+
+
+def test_clusters_skip_noisy_assignment(
+  make_test_data: TestDataMaker, mocker: MockerFixture
+) -> None:
+  texts: list[str] = [
+    'Can you summarize this article',
+    'Can you rewrite this in a simpler way',
+    'Can you provide a short summary of the following text',
+    'Can you simplify this text',
+    'Hello world',
+  ]
+  dataset = make_test_data([{'text': t} for t in texts])
+
+  def topic_fn(docs: list[tuple[str, float]]) -> str:
+    if 'summar' in docs[0][0]:
+      return 'summarization'
+    elif 'simpl' in docs[0][0]:
+      return 'simplification'
+    return 'other'
+
+  mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2)
+  _mock_jina(mocker)
+
+  dataset.cluster(
+    'text',
+    min_cluster_size=2,
+    topic_fn=topic_fn,
+    category_fn=lambda _: 'MockCategory',
+    skip_noisy_assignment=True,
+  )
+
+  rows = list(dataset.select_rows(['text', 'text__cluster'], combine_columns=True))
+  assert rows == [
+    {
+      'text': 'Can you summarize this article',
+      'text__cluster': {
+        'cluster_id': 0,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'summarization',
+        'category_id': 0,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you rewrite this in a simpler way',
+      'text__cluster': {
+        'cluster_id': 1,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'simplification',
+        'category_id': 1,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you provide a short summary of the following text',
+      'text__cluster': {
+        'cluster_id': 0,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'summarization',
+        'category_id': 0,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Can you simplify this text',
+      'text__cluster': {
+        'cluster_id': 1,
+        'cluster_membership_prob': 1.0,
+        'cluster_title': 'simplification',
+        'category_id': 1,
+        'category_membership_prob': 1.0,
+        'category_title': 'MockCategory',
+      },
+    },
+    {
+      'text': 'Hello world',
+      'text__cluster': {
+        'cluster_id': -1,
+        'cluster_membership_prob': 0.0,
+        'cluster_title': None,
+        'category_id': -1,
+        'category_membership_prob': 0.0,
+        'category_title': None,
+      },
+    },
+  ]
lilac/data/dataset.py (4 additions, 0 deletions)

@@ -506,6 +506,7 @@ def cluster(
     task_id: Optional[TaskId] = None,
     # TODO(0.4.0): colocate with topic_fn.
     category_fn: Optional[TopicFn] = None,
+    skip_noisy_assignment: bool = False,
   ) -> None:
     """Compute clusters for a field of the dataset.

@@ -524,6 +525,9 @@
         of the task.
       category_fn: A function that returns a category for a set of related titles. It takes a list
         of (doc, membership_score) tuples and returns a single category name.
+      skip_noisy_assignment: If true, noisy points will not be assigned to the nearest cluster.
+        This only has an effect when the clustering is done locally (use_garden=False) and will
+        speed up clustering.

     """
     pass
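From the public API, opting in looks like the call in the test above. A usage sketch (the namespace and dataset name here are hypothetical):

import lilac as ll

dataset = ll.get_dataset('local', 'my_dataset')  # hypothetical namespace/name
dataset.cluster(
  'text',
  min_cluster_size=2,
  skip_noisy_assignment=True,  # noise points keep cluster_id -1 instead of the nearest cluster
)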
lilac/data/dataset_duckdb.py (3 additions, 0 deletions)

@@ -3334,6 +3334,7 @@ def cluster(
     use_garden: bool = False,
     task_id: Optional[TaskId] = None,
     category_fn: Optional[TopicFn] = cluster_titling.generate_category_openai,
+    skip_noisy_assignment: bool = False,
   ) -> None:
     topic_fn = topic_fn or cluster_titling.generate_title_openai
     category_fn = category_fn or cluster_titling.generate_category_openai
@@ -3347,6 +3348,7 @@
       overwrite=overwrite,
       use_garden=use_garden,
       task_id=task_id,
+      skip_noisy_assignment=skip_noisy_assignment,
     )

   @override
@@ -3950,6 +3952,7 @@ def _auto_bins(stats: StatsResult) -> list[Bin]:
     return [('0', const_val, None)]

   is_integer = stats.value_samples and all(isinstance(val, int) for val in stats.value_samples)
+
   def _round(value: float) -> float:
     # Select a round ndigits as a function of the value range. We offset it by 2 to allow for some
     # decimal places as a function of the range.