From fbe273a2f47ca451b2eaa886d1d5c96d3fde0acb Mon Sep 17 00:00:00 2001 From: huyiwen <1020030101@qq.com> Date: Fri, 16 Aug 2024 18:33:59 +0800 Subject: [PATCH] [feat] add hfd flags --- utilization/load_dataset.py | 15 +++++++++++---- utilization/utils/arguments.py | 17 +++++++++++++++-- utilization/utils/hfd.py | 12 +++++++++++- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/utilization/load_dataset.py b/utilization/load_dataset.py index 78f8da37..deea6d54 100644 --- a/utilization/load_dataset.py +++ b/utilization/load_dataset.py @@ -112,6 +112,7 @@ def get_subsets( paths = [str(get_script_path(cache_path))] + paths found_config = False + s = None for path in paths: if path is None: continue @@ -121,7 +122,11 @@ def get_subsets( found_config = True break except Exception as e: - logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}. Trying another method...") + logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}") + if s is None: + raise RuntimeError( + f"Failed to get dataset config names for {dataset_cls}. Please check the dataset path or the internet connection. If you have problem connecting to HuggingFace, you can set `--hf_mirror` to use a mirror site." 
+ ) logger.debug(f"get_dataset_config_names({path}): {s}") @@ -238,7 +243,7 @@ def load_dataset( dataset_classes = import_dataset_classes(dataset_name) cache_paths = [] for dcls in dataset_classes: - if dcls.load_args is None: + if not isinstance(dcls.load_args, tuple): continue elif len(dcls.load_args) > 0: cache_paths.append( @@ -247,7 +252,8 @@ def load_dataset( args.hfd_cache_path, hf_username=evaluation_args.hf_username, hf_token=evaluation_args.hf_token, - mirror=args.hf_mirror + mirror=args.hf_mirror, + evaluation_args=evaluation_args, ) ) else: @@ -258,7 +264,8 @@ def load_dataset( args.hfd_cache_path, hf_username=evaluation_args.hf_username, hf_token=evaluation_args.hf_token, - mirror=args.hf_mirror + mirror=args.hf_mirror, + evaluation_args=evaluation_args, ) ) available_subsets_by_cls = get_subsets(dataset_name, dataset_classes, args, cache_paths) diff --git a/utilization/utils/arguments.py b/utilization/utils/arguments.py index fcd6e70d..9212ac44 100644 --- a/utilization/utils/arguments.py +++ b/utilization/utils/arguments.py @@ -11,7 +11,6 @@ import tiktoken -from ..chat_templates import DEFAULT_CHAT_CONFIGS from ..dataset_enum import DEFAULT_VLLM_DATASETS from ..model_enum import ( ANTHROPIC_CHAT_COMPLETIONS_ARGS, API_MODELS, DASHSCOPE_CHAT_COMPLETIONS_ARGS, HUGGINGFACE_ARGS, @@ -488,7 +487,6 @@ class DatasetArguments: ) continue_from: ClassVar[int] = 0 - """The number of instances (lines) in .json file to resume from. This is set in `PredictionWriter.write_metainfo`.""" # set in `set_logging` with format "{evaluation_results_dir}/{log_filename}.json" evaluation_results_path: ClassVar[Optional[str]] = None @@ -515,6 +513,9 @@ def __post_init__(self): f"Invalid batch size: {self.batch_size}. Specify an integer (e.g., '10') to use a fixed batch size for all iterations. 
Alternatively, append ':auto' (e.g., '10:auto') to start with the specified batch size and automatically adjust it in subsequent iterations to maintain constant CUDA memory usage" ) + if self.hf_mirror: + os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" + @dataclass class EvaluationArguments: @@ -572,6 +573,18 @@ class EvaluationArguments: default=False, help="Whether to skip metrics evaluation and only do inference", ) + hfd_exclude_pattern: Optional[str] = HfArg( + default=None, + help="The exclude pattern for datasets downloaded with hfd.sh", + ) + hfd_include_pattern: Optional[str] = HfArg( + default=None, + help="The include pattern for datasets downloaded with hfd.sh", + ) + hfd_skip_check: bool = HfArg( + default=False, + help="Whether to skip the check of hfd.sh", + ) _redact = {"hf_token"} diff --git a/utilization/utils/hfd.py b/utilization/utils/hfd.py index 80e5c4fc..b4685ee6 100644 --- a/utilization/utils/hfd.py +++ b/utilization/utils/hfd.py @@ -41,6 +41,7 @@ def huggingface_download( hf_token: Optional[str] = None, old: str = "https://huggingface.co", new: str = "https://hf-mirror.com", + evaluation_args=None, ) -> Optional[str]: """Download a dataset from Hugging Face Hub to a local directory using hfd.sh.""" @@ -62,14 +63,23 @@ def huggingface_download( logger.debug(f"Downloading {path} to {repo_path}") mirror_flag = " --mirror" if mirror else "" + if hf_username and hf_token: auth = f" --hf_username {hf_username} --hf_token {hf_token}" elif hf_token: auth = f" --hf_token {hf_token}" else: auth = "" + + if evaluation_args is not None: + if evaluation_args.hfd_exclude_pattern is not None: + auth += f" --exclude-pattern {evaluation_args.hfd_exclude_pattern}" + if evaluation_args.hfd_include_pattern is not None: + auth += f" --include-pattern {evaluation_args.hfd_include_pattern}" + command = f"bash {hfd_cli.as_posix()} {path} --dataset --local-dir {repo_path.as_posix()}{auth}" - os.system(command + mirror_flag) + if evaluation_args is None or not evaluation_args.hfd_skip_check: + 
os.system(command + mirror_flag) update_script(load_script_path, mirror, old, new)