diff --git a/notebooks/Clustering copy.ipynb b/notebooks/Clustering copy.ipynb deleted file mode 100644 index 84a3ffbc..00000000 --- a/notebooks/Clustering copy.ipynb +++ /dev/null @@ -1,120 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Clustering\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook accompanies the [Cluster a dataset](https://docs.lilacml.com/datasets/dataset_cluster.html) guide.\n", - "Let's start by loading a small dataset of multi-turn conversations between a human and a chatbot:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset \"capybara\" written to ./datasets/local/capybara\n" - ] - } - ], - "source": [ - "import lilac as ll\n", - "\n", - "ds = ll.get_dataset('local', 'OpenHermes-2.5-100k')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can cluster the `input` field under the `conversation` array by calling:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[local/capybara][1 shards] map \"extract_text\" to \"('conversation_input__cluster',)\": 100%|██████████| 16006/16006 [00:00<00:00, 30424.61it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrote map output to conversation_input__cluster-00000-of-00001.parquet\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[local/capybara][1 shards] map \"compute_clusters\" to \"('conversation_input__cluster',)\": 0%| | 0/16006 [00:00