Deploy dataset (#1085)
deploy_project(_operations) no longer does any dataset uploading; that
is expected to be handled by the caller. Deploy is purely about
uploading the project metadata and configuring the hf space.

There are four entry points to deploy_project_operations to
double-check:
- CLI; not sure how this is used
- deploy_demo; this has an explicit project config at lilac_hf_space.yml
that should be the source of truth, even if we only fiddle with one
dataset at a time
- deploy_staging; this has no explicit project config and has to be
dynamically constructed based on the dataset that gets uploaded
- deployer app; this constructs a project config and does no data upload
(relies on the load() call in hf_docker_start to download the data from HF)
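
To make the new contract concrete, here is a minimal sketch of a caller that first uploads a dataset and then deploys, assembled from the signatures visible in the diff below. The space name, dataset name, and project directory are hypothetical, and the import paths are assumptions about the package layout rather than a documented API.

from lilac.data.dataset_storage_utils import upload
from lilac.deploy import deploy_project
from lilac.env import env
from lilac.utils import get_hf_dataset_repo_id

hf_space = 'my-org/my-space'  # hypothetical HF space (SPACE_ORG/SPACE_NAME)
dataset = 'local/my-dataset'  # hypothetical dataset namespace/name
project_dir = './my_project'  # hypothetical project directory
hf_token = env('HF_ACCESS_TOKEN')

# The caller is now responsible for pushing each dataset to its HF dataset repo...
upload(
  dataset=dataset,
  project_dir=project_dir,
  url_or_repo=get_hf_dataset_repo_id(*hf_space.split('/'), *dataset.split('/')),
  public=False,
  hf_token=hf_token,
)

# ...while deploy_project only uploads project metadata and configures the space.
deploy_project(
  hf_space=hf_space,
  project_dir=project_dir,
)

The deployer-app path can instead pass the new project_config argument to deploy_project and skip the upload entirely, leaving load() on the space to pull the data down from HF.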
brilee authored Jan 19, 2024
1 parent 4a1c4d3 commit 1ecef77
Showing 10 changed files with 159 additions and 163 deletions.
23 changes: 16 additions & 7 deletions lilac/cli.py
@@ -12,8 +12,9 @@
from .env import env, get_project_dir
from .hf_docker_start import hf_docker_start
from .load import load
from .project import dir_is_project, init, project_dir_from_args
from .project import dir_is_project, init, project_dir_from_args, read_project_config
from .server import start_server
from .utils import get_hf_dataset_repo_id


@click.command()
@@ -185,22 +186,30 @@ def deploy_project_command(
hf_token: Optional[str],
) -> None:
"""Deploy a project directory to a HuggingFace Space."""
# When datasets aren't define, set to None so we upload all datasets.
if not dataset:
dataset = None
# When datasets aren't defined, upload all datasets by default.
if dataset is None:
project_config = read_project_config(project_dir)
dataset = [f'{d.namespace}/{d.name}' for d in project_config.datasets]
# When concepts aren't defined, set to None so we upload all concepts.
if not concept:
concept = None

hf_token = hf_token or env('HF_ACCESS_TOKEN')

if not skip_data_upload:
for d in dataset:
upload(
dataset=d,
project_dir=project_dir,
url_or_repo=get_hf_dataset_repo_id(*hf_space.split('/'), *d.split('/')),
public=make_datasets_public,
hf_token=hf_token,
)

deploy_project(
project_dir=project_dir,
hf_space=hf_space,
datasets=dataset,
concepts=concept,
make_datasets_public=make_datasets_public,
skip_data_upload=skip_data_upload,
skip_concept_upload=skip_concept_upload,
create_space=create_space,
load_on_space=load_on_space,
2 changes: 1 addition & 1 deletion lilac/config.py
@@ -212,7 +212,7 @@ class Config(BaseModel):

# When defined, uses this list of signals to run over every dataset, over all media paths, unless
# signals is overridden by a specific dataset.
signals: list[Signal] = PydanticField(
signals: SerializeAsAny[list[Signal]] = PydanticField(
description='The signals to run for every dataset.', default=[]
)

3 changes: 0 additions & 3 deletions lilac/data/clustering.py
@@ -469,9 +469,6 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]:
),
)

if task_id:
task_manager.set_completed(task_id)


def _hdbscan_cluster(
docs: Iterator[str],
160 changes: 42 additions & 118 deletions lilac/deploy.py
@@ -12,7 +12,6 @@

from .concepts.db_concept import CONCEPTS_DIR, DiskConceptDB, get_concept_output_dir
from .config import Config
from .data.dataset_storage_utils import upload
from .env import get_project_dir
from .project import PROJECT_CONFIG_FILENAME, read_project_config, write_project_config
from .utils import get_hf_dataset_repo_id, get_lilac_cache_dir, log, to_yaml
@@ -27,11 +26,9 @@

def deploy_project(
hf_space: str,
project_config: Optional[Config] = None,
project_dir: Optional[str] = None,
datasets: Optional[list[str]] = None,
make_datasets_public: Optional[bool] = False,
concepts: Optional[list[str]] = None,
skip_data_upload: Optional[bool] = False,
skip_concept_upload: Optional[bool] = False,
create_space: Optional[bool] = False,
load_on_space: Optional[bool] = False,
@@ -42,12 +39,9 @@ def deploy_project(
Args:
hf_space: The huggingface space. Should be formatted like `SPACE_ORG/SPACE_NAME`.
project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`.
datasets: The names of datasets to upload. Defaults to all datasets.
make_datasets_public: When true, and when --load_on_space is False, sets the HuggingFace
datasets that reflect local datasets to public. Defaults to false.
project_config: A project config for the space; defaults to config file found in project_dir.
project_dir: The project directory to grab data from. Defaults to `env.LILAC_PROJECT_DIR`.
concepts: The names of concepts to upload. Defaults to all concepts.
skip_data_upload: When true, kicks the server without uploading data.
skip_concept_upload: When true, skips uploading concepts.
create_space: When True, creates the HuggingFace space if it doesn't exist. The space will be
created with the storage type defined by --hf_space_storage.
@@ -79,10 +73,8 @@
hf_api=hf_api,
project_dir=project_dir,
hf_space=hf_space,
datasets=datasets,
make_datasets_public=make_datasets_public,
project_config=project_config,
concepts=concepts,
skip_data_upload=skip_data_upload,
skip_concept_upload=skip_concept_upload,
create_space=create_space,
load_on_space=load_on_space,
@@ -106,16 +98,15 @@ def deploy_project_operations(
hf_api: 'HfApi',
project_dir: str,
hf_space: str,
datasets: Optional[list[str]] = None,
make_datasets_public: Optional[bool] = False,
project_config: Optional[Config] = None,
concepts: Optional[list[str]] = None,
skip_data_upload: Optional[bool] = False,
skip_concept_upload: Optional[bool] = False,
create_space: Optional[bool] = False,
load_on_space: Optional[bool] = False,
hf_space_storage: Optional[Union[Literal['small'], Literal['medium'], Literal['large']]] = None,
) -> list:
"""The commit operations for a project deployment."""
project_config = project_config or read_project_config(project_dir)
try:
from huggingface_hub import CommitOperationAdd, CommitOperationDelete
from huggingface_hub.utils._errors import RepositoryNotFoundError
@@ -178,72 +169,49 @@ def deploy_project_operations(
operations.extend(_make_wheel_dir(hf_api, hf_space))

##
## Upload datasets.
## Upload the HuggingFace application file (README.md) with uploaded datasets.
##
project_config = read_project_config(project_dir)
# When datasets aren't explicitly defined, read all datasets and upload them.
if datasets is None:
datasets = [f'{d.namespace}/{d.name}' for d in project_config.datasets]

if not skip_data_upload and not load_on_space:
lilac_hf_datasets = _upload_datasets(
api=hf_api,
project_dir=project_dir,
hf_space=hf_space,
datasets=datasets,
make_datasets_public=make_datasets_public,
)
else:
lilac_hf_datasets = []

##
## Upload the HuggingFace application file (README.md) with uploaded datasets so they are synced
## to storage when the docker image boots up.
##
if (lilac_hf_datasets and not skip_data_upload) or load_on_space:
readme = (
'---\n'
+ to_yaml(
{
'title': 'Lilac',
'emoji': '🌷',
'colorFrom': 'purple',
'colorTo': 'purple',
'sdk': 'docker',
'app_port': 5432,
'datasets': [d for d in lilac_hf_datasets],
}
)
+ '\n---'
hf_space_org, hf_space_name = hf_space.split('/')
dataset_repos = [
get_hf_dataset_repo_id(hf_space_org, hf_space_name, d.namespace, d.name)
for d in project_config.datasets
]
readme = (
'---\n'
+ to_yaml(
{
'title': 'Lilac',
'emoji': '🌷',
'colorFrom': 'purple',
'colorTo': 'purple',
'sdk': 'docker',
'app_port': 5432,
'datasets': dataset_repos,
}
)
readme_filename = 'README.md'
if hf_api.file_exists(hf_space, readme_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=readme_filename))
+ '\n---'
)
readme_filename = 'README.md'
if hf_api.file_exists(hf_space, readme_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=readme_filename))

operations.append(
CommitOperationAdd(path_in_repo=readme_filename, path_or_fileobj=readme.encode())
)
operations.append(
CommitOperationAdd(path_in_repo=readme_filename, path_or_fileobj=readme.encode())
)
##
## Upload the lilac.yml project configuration.
##
if datasets and not skip_data_upload:
project_config_filename = f'data/{PROJECT_CONFIG_FILENAME}'
# Filter datasets that aren't explicitly defined.
project_config.datasets = [
dataset
for dataset in project_config.datasets
if f'{dataset.namespace}/{dataset.name}' in datasets
]
if hf_api.file_exists(hf_space, project_config_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=project_config_filename))
operations.append(
CommitOperationAdd(
path_in_repo=project_config_filename,
path_or_fileobj=to_yaml(
project_config.model_dump(exclude_defaults=True, exclude_none=True, exclude_unset=True)
).encode(),
)
project_config_filename = f'data/{PROJECT_CONFIG_FILENAME}'
if hf_api.file_exists(hf_space, project_config_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=project_config_filename))
operations.append(
CommitOperationAdd(
path_in_repo=project_config_filename,
path_or_fileobj=to_yaml(
project_config.model_dump(exclude_defaults=True, exclude_none=True, exclude_unset=True)
).encode(),
)
)

##
## Upload concepts.
@@ -416,50 +384,6 @@ def _upload_concepts(
return operations, concepts


def _upload_datasets(
api: Any,
project_dir: str,
hf_space: str,
datasets: list[str],
make_datasets_public: Optional[bool] = False,
) -> list[str]:
"""Uploads local datasets to HuggingFace datasets."""
if not make_datasets_public:
make_datasets_public = False
try:
from huggingface_hub import HfApi

except ImportError:
raise ImportError(
'Could not import the "huggingface_hub" python package. '
'Please install it with `pip install "huggingface_hub".'
)
hf_api: HfApi = api

hf_space_org, hf_space_name = hf_space.split('/')

log('Uploading datasets: ', datasets)

lilac_hf_datasets: list[str] = []
# Upload datasets to HuggingFace. We do this after uploading code to avoid clobbering the data
# directory.
# NOTE(nsthorat): This currently doesn't write to persistent storage directly.
for d in datasets:
namespace, name = d.split('/')
dataset_repo_id = get_hf_dataset_repo_id(hf_space_org, hf_space_name, namespace, name)

upload(
dataset=d,
project_dir=project_dir,
url_or_repo=dataset_repo_id,
public=make_datasets_public,
hf_token=hf_api.token,
)

lilac_hf_datasets.append(dataset_repo_id)
return lilac_hf_datasets


def deploy_config(
hf_space: str,
config: Config,
9 changes: 6 additions & 3 deletions lilac/embeddings/cohere.py
@@ -58,9 +58,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
cohere_input_type = 'search_document' if self.embed_input_type == 'document' else 'search_query'

def _embed_fn(docs: list[str]) -> list[np.ndarray]:
return self._model.embed(
docs, truncate='END', model=COHERE_EMBED_MODEL, input_type=cohere_input_type
).embeddings
return [
np.array(e)
for e in self._model.embed(
docs, truncate='END', model=COHERE_EMBED_MODEL, input_type=cohere_input_type
).embeddings
]

return chunked_compute_embedding(
_embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
2 changes: 1 addition & 1 deletion lilac/embeddings/embedding.py
@@ -47,7 +47,7 @@ def _embed_fn(data: Iterable[RichData]) -> Iterable[list[SpanVector]]:

for item in items:
if not item:
raise ValueError('Embedding signal returned None.')
raise ValueError('Embedding signal returned None.', embedding)

yield [
{
9 changes: 7 additions & 2 deletions lilac/load.py
@@ -14,7 +14,7 @@
from .env import get_project_dir
from .load_dataset import process_source
from .project import PROJECT_CONFIG_FILENAME
from .schema import ROWID, PathTuple
from .schema import PathTuple
from .utils import DebugTimer, get_datasets_dir, log


@@ -78,7 +78,7 @@ def load(
total_num_rows = 0
for d in datasets_to_load:
dataset = DatasetDuckDB(d.namespace, d.name, project_dir=project_dir)
num_rows = dataset.select_rows([ROWID], limit=1).total_num_rows
num_rows = dataset.count(query_options=None)
log(f'{d.namespace}/{d.name} loaded with {num_rows:,} rows.')

# Free up RAM.
@@ -163,6 +163,11 @@ def load(
log('*** Compute clusters ***')
with DebugTimer('Computing clusters'):
for c in config.clusters:
if not any(
c.dataset_namespace == d.namespace and c.dataset_name == d.name for d in config.datasets
):
print('Skipping cluster for non-existent dataset:', c)
continue
dataset = DatasetDuckDB(c.dataset_namespace, c.dataset_name, project_dir=project_dir)
manifest = dataset.manifest()
schema = manifest.data_schema
20 changes: 20 additions & 0 deletions lilac_hf_space.yml
@@ -92,6 +92,22 @@ datasets:
path:
- response

- namespace: lilac
name: dolphin
source:
dataset_name: cognitivecomputations/dolphin
config_name: flan1m-alpaca-uncensored
source_name: huggingface
settings:
ui:
media_paths:
- instruction
- input
- output
- - input__cluster
- text
markdown_paths: []

signals:
- signal_name: pii
- signal_name: text_statistics
@@ -127,3 +143,7 @@ clusters:
dataset_name: OpenOrca-100k
input_path:
- question
- dataset_namespace: lilac
dataset_name: dolphin
input_path:
- input