how add dataset

OpenBMB · Jan 16, 2025 · 744fdef · 744fdef
1 parent 73a1065
commit 744fdef
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 5 deletions.
diff --git a/audio_evals/dataset/huggingface.py b/audio_evals/dataset/huggingface.py
@@ -44,7 +44,11 @@ def load_audio_hf_dataset(name, subset=None, split="", local_path="", col_aliase
             load_args["name"] = subset
         if split:
             load_args["split"] = split
-        ds = load_dataset(**load_args)
+        try:
+            ds = load_dataset(**load_args, trust_remote_code=True)
+        except Exception as e:
+            logger.error(f"load args is {load_args}load dataset error: {e}")
+            raise e
 
     for k, v in col_aliases.items():
         if v in ds.column_names:
@@ -78,7 +82,7 @@ def __init__(
         subset: Optional[str] = None,
         split: str = "",
         local_path: str = "",
-        col_aliases: Dict[str, str] = None
+        col_aliases: Dict[str, str] = None,
     ):
         super().__init__(default_task, ref_col, col_aliases)
         self.name = name

diff --git a/docs/how add a dataset.md b/docs/how add a dataset.md
@@ -11,7 +11,7 @@ here are steps:
 ## JSON file:
 
 ### register the dataset
-1. make sure your dataset file is `jsonl` format
+1. make sure your dataset file is in `jsonl` format and has a `WavPath` column which specifies the audio file path.
 2. create a new file `**.yaml` in `registry/dataset/`
     content like :
     ```yaml
@@ -20,6 +20,32 @@ here are steps:
    args:
      default_task: alei_asr  # you should specify an eval task as default, you can find valid task in  `registry/eval_task`
      f_name:  # the file name
-     ref_col:  # the reference column name in file
+     ref_col:  # the reference answer column name in file
     ```
 after registering the dataset, you can eval your dataset with --dataset $name, enjoy 😘
+
+Example:
+
+1. create a file `my_dataset.jsonl` with `WavPath` and `Transcript` columns, with content like this:
+```json lines
+{"WavPath": "path/to/audio1.wav", "Transcript": "this is the first audio"}
+{"WavPath": "path/to/audio2.wav", "Transcript": "this is the second audio"}
+```
+
+2. create a file `my_dataset.yaml` in `registry/dataset/` with content:
+```yaml
+my_dataset:
+  class: audio_evals.dataset.dataset.JsonlFile
+  args:
+    default_task: asr
+    f_name: my_dataset.jsonl     # the file name
+    ref_col: Transcript           # the reference answer column name in file
+```
+
+3. eval your dataset with `--dataset my_dataset`
+
+```sh
+export PYTHONPATH=$PWD:$PYTHONPATH
+export OPENAI_API_KEY=$your-key
+python audio_evals/main.py --dataset my_dataset --model gpt4o_audio
+```
diff --git a/registry/dataset/GigaSpeech.yaml b/registry/dataset/GigaSpeech.yaml
@@ -3,5 +3,5 @@ gigaspeech:
   args:
     default_task: asr
     name: speechcolab/gigaspeech
-    split: test
+    subset: test
     ref_col: text