Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] add hfd flags #281

Merged
merged 1 commit into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions utilization/load_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def get_subsets(
paths = [str(get_script_path(cache_path))] + paths

found_config = False
s = None
for path in paths:
if path is None:
continue
Expand All @@ -121,7 +122,11 @@ def get_subsets(
found_config = True
break
except Exception as e:
logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}. Trying another method...")
logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}")
if s is None:
raise RuntimeError(
f"Failed to get dataset config names for {dataset_cls}. Please check the dataset path or the internet connection. If you have problem connecting to HuggingFace, you can set `--hf_mirror` to use a mirror site."
)

logger.debug(f"get_dataset_config_names({path}): {s}")

Expand Down Expand Up @@ -238,7 +243,7 @@ def load_dataset(
dataset_classes = import_dataset_classes(dataset_name)
cache_paths = []
for dcls in dataset_classes:
if dcls.load_args is None:
if not isinstance(dcls.load_args, tuple):
continue
elif len(dcls.load_args) > 0:
cache_paths.append(
Expand All @@ -247,7 +252,8 @@ def load_dataset(
args.hfd_cache_path,
hf_username=evaluation_args.hf_username,
hf_token=evaluation_args.hf_token,
mirror=args.hf_mirror
mirror=args.hf_mirror,
evaluation_args=evaluation_args,
)
)
else:
Expand All @@ -258,7 +264,8 @@ def load_dataset(
args.hfd_cache_path,
hf_username=evaluation_args.hf_username,
hf_token=evaluation_args.hf_token,
mirror=args.hf_mirror
mirror=args.hf_mirror,
evaluation_args=evaluation_args,
)
)
available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, cache_paths)
Expand Down
17 changes: 15 additions & 2 deletions utilization/utils/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import tiktoken

from ..chat_templates import DEFAULT_CHAT_CONFIGS
from ..dataset_enum import DEFAULT_VLLM_DATASETS
from ..model_enum import (
ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, HUGGINGFACE_ARGS,
Expand Down Expand Up @@ -488,7 +487,6 @@ class DatasetArguments:
)

continue_from: ClassVar[int] = 0
"""The number of instances (lines) in .json file to resume from. This is set in `PredictionWriter.write_metainfo`."""

# set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json"
evaluation_results_path: ClassVar[Optional[str]] = None
Expand All @@ -515,6 +513,9 @@ def __post_init__(self):
f"Invalid batch size: {self.batch_size}. Specify an integer (e.g., '10') to use a fixed batch size for all iterations. Alternatively, append ':auto' (e.g., '10:auto') to start with the specified batch size and automatically adjust it in subsequent iterations to maintain constant CUDA memory usage"
)

if self.hf_mirror:
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"


@dataclass
class EvaluationArguments:
Expand Down Expand Up @@ -572,6 +573,18 @@ class EvaluationArguments:
default=False,
help="Whether to skip metrics evaluation and only do inference",
)
hfd_exclude_pattern: str = HfArg(
default=None,
help="The exclude pattern for datasets downloaded with hfd.sh",
)
hfd_include_pattern: str = HfArg(
default=None,
help="The include pattern for datasets downloaded with hfd.sh",
)
hfd_skip_check: bool = HfArg(
default=False,
help="Whether to skip the check of hfd.sh",
)

_redact = {"hf_token"}

Expand Down
12 changes: 11 additions & 1 deletion utilization/utils/hfd.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def huggingface_download(
hf_token: Optional[str] = None,
old: str = "https://huggingface.co",
new: str = "https://hf-mirror.com",
evaluation_args=None,
) -> Optional[str]:
"""Download a dataset from Hugging Face Hub to a local directory using hfd.sh."""

Expand All @@ -62,14 +63,23 @@ def huggingface_download(
logger.debug(f"Downloading {path} to {repo_path}")

mirror_flag = " --mirror" if mirror else ""

if hf_username and hf_token:
auth = f" --hf_username {hf_username} --hf_token {hf_token}"
elif hf_token:
auth = f" --hf_token {hf_token}"
else:
auth = ""

if evaluation_args is not None:
if evaluation_args.hfd_exclude_pattern is not None:
auth += f" --exclude-pattern {evaluation_args.hfd_exclude_pattern}"
if evaluation_args.hfd_include_pattern is not None:
auth += f" --include-pattern {evaluation_args.hfd_include_pattern}"

command = f"bash {hfd_cli.as_posix()} {path} --dataset --local-dir {repo_path.as_posix()}{auth}"
os.system(command + mirror_flag)
if not evaluation_args.hfd_skip_check:
os.system(command + mirror_flag)

update_script(load_script_path, mirror, old, new)

Expand Down
Loading