Deploy dataset (#1085)
deploy_project(_operations) no longer does any dataset uploading; that
is expected to be handled by the caller. Deploy is purely about
uploading the project metadata and configuring the hf space.

There are four entry points to deploy_project_operations to
double-check:
- CLI; not sure how this is used
- deploy_demo; this has an explicit project config at lilac_hf_space.yml
that should be the source of truth, even if we only fiddle with one
dataset at a time
- deploy_staging; this has no explicit project config and has to be
dynamically constructed based on the dataset that gets uploaded
- deployer app; this constructs a project config and does no data upload
(relies on the load() call in hf_docker_start to download the data from HF)
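
To make the new contract concrete, here is a minimal sketch of a caller that first uploads a dataset and then deploys, assembled from the signatures visible in the diff below. The space name, dataset name, and project directory are hypothetical, and the import paths are assumptions about the package layout rather than a documented API.

from lilac.data.dataset_storage_utils import upload
from lilac.deploy import deploy_project
from lilac.env import env
from lilac.utils import get_hf_dataset_repo_id

hf_space = 'my-org/my-space'  # hypothetical HF space (SPACE_ORG/SPACE_NAME)
dataset = 'local/my-dataset'  # hypothetical dataset namespace/name
project_dir = './my_project'  # hypothetical project directory
hf_token = env('HF_ACCESS_TOKEN')

# The caller is now responsible for pushing each dataset to its HF dataset repo...
upload(
  dataset=dataset,
  project_dir=project_dir,
  url_or_repo=get_hf_dataset_repo_id(*hf_space.split('/'), *dataset.split('/')),
  public=False,
  hf_token=hf_token,
)

# ...while deploy_project only uploads project metadata and configures the space.
deploy_project(
  hf_space=hf_space,
  project_dir=project_dir,
)

The deployer-app path can instead pass the new project_config argument to deploy_project and skip the upload entirely, leaving load() on the space to pull the data down from HF.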
brilee authored Jan 19, 2024
1 parent 4a1c4d3 commit 1ecef77
Showing 10 changed files with 159 additions and 163 deletions.
23 changes: 16 additions & 7 deletions lilac/cli.py
@@ -12,8 +12,9 @@
from .env import env, get_project_dir
from .hf_docker_start import hf_docker_start
from .load import load
from .project import dir_is_project, init, project_dir_from_args
from .project import dir_is_project, init, project_dir_from_args, read_project_config
from .server import start_server
from .utils import get_hf_dataset_repo_id


@click.command()
@@ -185,22 +186,30 @@ def deploy_project_command(
hf_token: Optional[str],
) -> None:
"""Deploy a project directory to a HuggingFace Space."""
# When datasets aren't define, set to None so we upload all datasets.
if not dataset:
dataset = None
# When datasets aren't defined, upload all datasets by default.
if dataset is None:
project_config = read_project_config(project_dir)
dataset = [f'{d.namespace}/{d.name}' for d in project_config.datasets]
# When concepts aren't defined, set to None so we upload all concepts.
if not concept:
concept = None

hf_token = hf_token or env('HF_ACCESS_TOKEN')

if not skip_data_upload:
for d in dataset:
upload(
dataset=d,
project_dir=project_dir,
url_or_repo=get_hf_dataset_repo_id(*hf_space.split('/'), *d.split('/')),
public=make_datasets_public,
hf_token=hf_token,
)

deploy_project(
project_dir=project_dir,
hf_space=hf_space,
datasets=dataset,
concepts=concept,
make_datasets_public=make_datasets_public,
skip_data_upload=skip_data_upload,
skip_concept_upload=skip_concept_upload,
create_space=create_space,
load_on_space=load_on_space,
2 changes: 1 addition & 1 deletion lilac/config.py
@@ -212,7 +212,7 @@ class Config(BaseModel):

# When defined, uses this list of signals to run over every dataset, over all media paths, unless
# signals is overridden by a specific dataset.
signals: list[Signal] = PydanticField(
signals: SerializeAsAny[list[Signal]] = PydanticField(
description='The signals to run for every dataset.', default=[]
)

3 changes: 0 additions & 3 deletions lilac/data/clustering.py
@@ -469,9 +469,6 @@ def compute_category_titles(items: Iterator[Item]) -> Iterator[Item]:
),
)

if task_id:
task_manager.set_completed(task_id)


def _hdbscan_cluster(
docs: Iterator[str],
160 changes: 42 additions & 118 deletions lilac/deploy.py
@@ -12,7 +12,6 @@

from .concepts.db_concept import CONCEPTS_DIR, DiskConceptDB, get_concept_output_dir
from .config import Config
from .data.dataset_storage_utils import upload
from .env import get_project_dir
from .project import PROJECT_CONFIG_FILENAME, read_project_config, write_project_config
from .utils import get_hf_dataset_repo_id, get_lilac_cache_dir, log, to_yaml
@@ -27,11 +26,9 @@

def deploy_project(
hf_space: str,
project_config: Optional[Config] = None,
project_dir: Optional[str] = None,
datasets: Optional[list[str]] = None,
make_datasets_public: Optional[bool] = False,
concepts: Optional[list[str]] = None,
skip_data_upload: Optional[bool] = False,
skip_concept_upload: Optional[bool] = False,
create_space: Optional[bool] = False,
load_on_space: Optional[bool] = False,
@@ -42,12 +39,9 @@ def deploy_project(
Args:
hf_space: The huggingface space. Should be formatted like `SPACE_ORG/SPACE_NAME`.
project_dir: The project directory to use for the demo. Defaults to `env.LILAC_PROJECT_DIR`.
datasets: The names of datasets to upload. Defaults to all datasets.
make_datasets_public: When true, and when --load_on_space is False, sets the HuggingFace
datasets that reflect local datasets to public. Defaults to false.
project_config: A project config for the space; defaults to config file found in project_dir.
project_dir: The project directory to grab data from. Defaults to `env.LILAC_PROJECT_DIR`.
concepts: The names of concepts to upload. Defaults to all concepts.
skip_data_upload: When true, kicks the server without uploading data.
skip_concept_upload: When true, skips uploading concepts.
create_space: When True, creates the HuggingFace space if it doesn't exist. The space will be
created with the storage type defined by --hf_space_storage.
@@ -79,10 +73,8 @@
hf_api=hf_api,
project_dir=project_dir,
hf_space=hf_space,
datasets=datasets,
make_datasets_public=make_datasets_public,
project_config=project_config,
concepts=concepts,
skip_data_upload=skip_data_upload,
skip_concept_upload=skip_concept_upload,
create_space=create_space,
load_on_space=load_on_space,
@@ -106,16 +98,15 @@ def deploy_project_operations(
hf_api: 'HfApi',
project_dir: str,
hf_space: str,
datasets: Optional[list[str]] = None,
make_datasets_public: Optional[bool] = False,
project_config: Optional[Config] = None,
concepts: Optional[list[str]] = None,
skip_data_upload: Optional[bool] = False,
skip_concept_upload: Optional[bool] = False,
create_space: Optional[bool] = False,
load_on_space: Optional[bool] = False,
hf_space_storage: Optional[Union[Literal['small'], Literal['medium'], Literal['large']]] = None,
) -> list:
"""The commit operations for a project deployment."""
project_config = project_config or read_project_config(project_dir)
try:
from huggingface_hub import CommitOperationAdd, CommitOperationDelete
from huggingface_hub.utils._errors import RepositoryNotFoundError
@@ -178,72 +169,49 @@ def deploy_project_operations(
operations.extend(_make_wheel_dir(hf_api, hf_space))

##
## Upload datasets.
## Upload the HuggingFace application file (README.md) with uploaded datasets.
##
project_config = read_project_config(project_dir)
# When datasets aren't explicitly defined, read all datasets and upload them.
if datasets is None:
datasets = [f'{d.namespace}/{d.name}' for d in project_config.datasets]

if not skip_data_upload and not load_on_space:
lilac_hf_datasets = _upload_datasets(
api=hf_api,
project_dir=project_dir,
hf_space=hf_space,
datasets=datasets,
make_datasets_public=make_datasets_public,
)
else:
lilac_hf_datasets = []

##
## Upload the HuggingFace application file (README.md) with uploaded datasets so they are synced
## to storage when the docker image boots up.
##
if (lilac_hf_datasets and not skip_data_upload) or load_on_space:
readme = (
'---\n'
+ to_yaml(
{
'title': 'Lilac',
'emoji': '🌷',
'colorFrom': 'purple',
'colorTo': 'purple',
'sdk': 'docker',
'app_port': 5432,
'datasets': [d for d in lilac_hf_datasets],
}
)
+ '\n---'
hf_space_org, hf_space_name = hf_space.split('/')
dataset_repos = [
get_hf_dataset_repo_id(hf_space_org, hf_space_name, d.namespace, d.name)
for d in project_config.datasets
]
readme = (
'---\n'
+ to_yaml(
{
'title': 'Lilac',
'emoji': '🌷',
'colorFrom': 'purple',
'colorTo': 'purple',
'sdk': 'docker',
'app_port': 5432,
'datasets': dataset_repos,
}
)
readme_filename = 'README.md'
if hf_api.file_exists(hf_space, readme_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=readme_filename))
+ '\n---'
)
readme_filename = 'README.md'
if hf_api.file_exists(hf_space, readme_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=readme_filename))

operations.append(
CommitOperationAdd(path_in_repo=readme_filename, path_or_fileobj=readme.encode())
)
operations.append(
CommitOperationAdd(path_in_repo=readme_filename, path_or_fileobj=readme.encode())
)
##
## Upload the lilac.yml project configuration.
##
if datasets and not skip_data_upload:
project_config_filename = f'data/{PROJECT_CONFIG_FILENAME}'
# Filter datasets that aren't explicitly defined.
project_config.datasets = [
dataset
for dataset in project_config.datasets
if f'{dataset.namespace}/{dataset.name}' in datasets
]
if hf_api.file_exists(hf_space, project_config_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=project_config_filename))
operations.append(
CommitOperationAdd(
path_in_repo=project_config_filename,
path_or_fileobj=to_yaml(
project_config.model_dump(exclude_defaults=True, exclude_none=True, exclude_unset=True)
).encode(),
)
project_config_filename = f'data/{PROJECT_CONFIG_FILENAME}'
if hf_api.file_exists(hf_space, project_config_filename, repo_type='space'):
operations.append(CommitOperationDelete(path_in_repo=project_config_filename))
operations.append(
CommitOperationAdd(
path_in_repo=project_config_filename,
path_or_fileobj=to_yaml(
project_config.model_dump(exclude_defaults=True, exclude_none=True, exclude_unset=True)
).encode(),
)
)

##
## Upload concepts.
@@ -416,50 +384,6 @@ def _upload_concepts(
return operations, concepts


def _upload_datasets(
api: Any,
project_dir: str,
hf_space: str,
datasets: list[str],
make_datasets_public: Optional[bool] = False,
) -> list[str]:
"""Uploads local datasets to HuggingFace datasets."""
if not make_datasets_public:
make_datasets_public = False
try:
from huggingface_hub import HfApi

except ImportError:
raise ImportError(
'Could not import the "huggingface_hub" python package. '
'Please install it with `pip install "huggingface_hub".'
)
hf_api: HfApi = api

hf_space_org, hf_space_name = hf_space.split('/')

log('Uploading datasets: ', datasets)

lilac_hf_datasets: list[str] = []
# Upload datasets to HuggingFace. We do this after uploading code to avoid clobbering the data
# directory.
# NOTE(nsthorat): This currently doesn't write to persistent storage directly.
for d in datasets:
namespace, name = d.split('/')
dataset_repo_id = get_hf_dataset_repo_id(hf_space_org, hf_space_name, namespace, name)

upload(
dataset=d,
project_dir=project_dir,
url_or_repo=dataset_repo_id,
public=make_datasets_public,
hf_token=hf_api.token,
)

lilac_hf_datasets.append(dataset_repo_id)
return lilac_hf_datasets


def deploy_config(
hf_space: str,
config: Config,
9 changes: 6 additions & 3 deletions lilac/embeddings/cohere.py
@@ -58,9 +58,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
cohere_input_type = 'search_document' if self.embed_input_type == 'document' else 'search_query'

def _embed_fn(docs: list[str]) -> list[np.ndarray]:
return self._model.embed(
docs, truncate='END', model=COHERE_EMBED_MODEL, input_type=cohere_input_type
).embeddings
return [
np.array(e)
for e in self._model.embed(
docs, truncate='END', model=COHERE_EMBED_MODEL, input_type=cohere_input_type
).embeddings
]

return chunked_compute_embedding(
_embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
2 changes: 1 addition & 1 deletion lilac/embeddings/embedding.py
@@ -47,7 +47,7 @@ def _embed_fn(data: Iterable[RichData]) -> Iterable[list[SpanVector]]:

for item in items:
if not item:
raise ValueError('Embedding signal returned None.')
raise ValueError('Embedding signal returned None.', embedding)

yield [
{
9 changes: 7 additions & 2 deletions lilac/load.py
@@ -14,7 +14,7 @@
from .env import get_project_dir
from .load_dataset import process_source
from .project import PROJECT_CONFIG_FILENAME
from .schema import ROWID, PathTuple
from .schema import PathTuple
from .utils import DebugTimer, get_datasets_dir, log


@@ -78,7 +78,7 @@ def load(
total_num_rows = 0
for d in datasets_to_load:
dataset = DatasetDuckDB(d.namespace, d.name, project_dir=project_dir)
num_rows = dataset.select_rows([ROWID], limit=1).total_num_rows
num_rows = dataset.count(query_options=None)
log(f'{d.namespace}/{d.name} loaded with {num_rows:,} rows.')

# Free up RAM.
@@ -163,6 +163,11 @@ def load(
log('*** Compute clusters ***')
with DebugTimer('Computing clusters'):
for c in config.clusters:
if not any(
c.dataset_namespace == d.namespace and c.dataset_name == d.name for d in config.datasets
):
print('Skipping cluster for non-existent dataset:', c)
continue
dataset = DatasetDuckDB(c.dataset_namespace, c.dataset_name, project_dir=project_dir)
manifest = dataset.manifest()
schema = manifest.data_schema
20 changes: 20 additions & 0 deletions lilac_hf_space.yml
@@ -92,6 +92,22 @@ datasets:
path:
- response

- namespace: lilac
name: dolphin
source:
dataset_name: cognitivecomputations/dolphin
config_name: flan1m-alpaca-uncensored
source_name: huggingface
settings:
ui:
media_paths:
- instruction
- input
- output
- - input__cluster
- text
markdown_paths: []

signals:
- signal_name: pii
- signal_name: text_statistics
@@ -127,3 +143,7 @@ clusters:
dataset_name: OpenOrca-100k
input_path:
- question
- dataset_namespace: lilac
dataset_name: dolphin
input_path:
- input