diff --git a/utilization/load_dataset.py b/utilization/load_dataset.py index 78f8da37..deea6d54 100644 --- a/utilization/load_dataset.py +++ b/utilization/load_dataset.py @@ -112,6 +112,7 @@ def get_subsets( paths = [str(get_script_path(cache_path))] + paths found_config = False + s = None for path in paths: if path is None: continue @@ -121,7 +122,11 @@ def get_subsets( found_config = True break except Exception as e: - logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}. Trying another method...") + logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}") + if s is None: + raise RuntimeError( + f"Failed to get dataset config names for {dataset_cls}. Please check the dataset path or the internet connection. If you have problem connecting to HuggingFace, you can set `--hf_mirror` to use a mirror site." + ) logger.debug(f"get_dataset_config_names({path}): {s}") @@ -238,7 +243,7 @@ def load_dataset( dataset_classes = import_dataset_classes(dataset_name) cache_paths = [] for dcls in dataset_classes: - if dcls.load_args is None: + if not isinstance(dcls.load_args, tuple): continue elif len(dcls.load_args) > 0: cache_paths.append( @@ -247,7 +252,8 @@ def load_dataset( args.hfd_cache_path, hf_username=evaluation_args.hf_username, hf_token=evaluation_args.hf_token, - mirror=args.hf_mirror + mirror=args.hf_mirror, + evaluation_args=evaluation_args, ) ) else: @@ -258,7 +264,8 @@ def load_dataset( args.hfd_cache_path, hf_username=evaluation_args.hf_username, hf_token=evaluation_args.hf_token, - mirror=args.hf_mirror + mirror=args.hf_mirror, + evaluation_args=evaluation_args, ) ) available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, cache_paths) diff --git a/utilization/utils/arguments.py b/utilization/utils/arguments.py index fcd6e70d..9212ac44 100644 --- a/utilization/utils/arguments.py +++ b/utilization/utils/arguments.py @@ -11,7 +11,6 @@ import tiktoken -from ..chat_templates import DEFAULT_CHAT_CONFIGS from ..dataset_enum import DEFAULT_VLLM_DATASETS from ..model_enum import ( ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, HUGGINGFACE_ARGS, @@ -488,7 +487,6 @@ class DatasetArguments: ) continue_from: ClassVar[int] = 0 - """The number of instances (lines) in .json file to resume from. This is set in `PredictionWriter.write_metainfo`.""" # set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json" evaluation_results_path: ClassVar[Optional[str]] = None @@ -515,6 +513,9 @@ def __post_init__(self): f"Invalid batch size: {self.batch_size}. Specify an integer (e.g., '10') to use a fixed batch size for all iterations. Alternatively, append ':auto' (e.g., '10:auto') to start with the specified batch size and automatically adjust it in subsequent iterations to maintain constant CUDA memory usage" ) + if self.hf_mirror: + os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" + @dataclass class EvaluationArguments: @@ -572,6 +573,18 @@ class EvaluationArguments: default=False, help="Whether to skip metrics evaluation and only do inference", ) + hfd_exclude_pattern: str = HfArg( + default=None, + help="The exclude pattern for datasets downloaded with hfd.sh", + ) + hfd_include_pattern: str = HfArg( + default=None, + help="The include pattern for datasets downloaded with hfd.sh", + ) + hfd_skip_check: bool = HfArg( + default=False, + help="Whether to skip the check of hfd.sh", + ) _redact = {"hf_token"} diff --git a/utilization/utils/hfd.py b/utilization/utils/hfd.py index 80e5c4fc..b4685ee6 100644 --- a/utilization/utils/hfd.py +++ b/utilization/utils/hfd.py @@ -41,6 +41,7 @@ def huggingface_download( hf_token: Optional[str] = None, old: str = "https://huggingface.co", new: str = "https://hf-mirror.com", + evaluation_args=None, ) -> Optional[str]: """Download a dataset from Hugging Face Hub to a local directory using hfd.sh.""" @@ -62,14 +63,23 @@ def huggingface_download( logger.debug(f"Downloading {path} to {repo_path}") mirror_flag = " --mirror" if mirror else "" + if hf_username and hf_token: auth = f" --hf_username {hf_username} --hf_token {hf_token}" elif hf_token: auth = f" --hf_token {hf_token}" else: auth = "" + + if evaluation_args is not None: + if evaluation_args.hfd_exclude_pattern is not None: + auth += f" --exclude-pattern {evaluation_args.hfd_exclude_pattern}" + if evaluation_args.hfd_include_pattern is not None: + auth += f" --include-pattern {evaluation_args.hfd_include_pattern}" + command = f"bash {hfd_cli.as_posix()} {path} --dataset --local-dir {repo_path.as_posix()}{auth}" - os.system(command + mirror_flag) + if not evaluation_args.hfd_skip_check: + os.system(command + mirror_flag) update_script(load_script_path, mirror, old, new)