From de3f6640dfd089068ebe1cbe24d04315ae7f56da Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 22 Apr 2022 12:59:36 +0200 Subject: [PATCH 1/4] Initial head qa dataset --- biodatasets/head_qa/head_qa.py | 349 +++++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 biodatasets/head_qa/head_qa.py diff --git a/biodatasets/head_qa/head_qa.py b/biodatasets/head_qa/head_qa.py new file mode 100644 index 00000000..17d8183f --- /dev/null +++ b/biodatasets/head_qa/head_qa.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio +de Sanidad, Consumo y Bienestar Social. +The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, +pharmacology and biology. + +Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py +""" + +import json +import os +from typing import Dict, List, Tuple + +import datasets + +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@inproceedings{vilares-gomez-rodriguez-2019-head, + title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", + author = "Vilares, David and + G{\'o}mez-Rodr{\'i}guez, Carlos", + booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/P19-1092", + doi = "10.18653/v1/P19-1092", + pages = "960--966"} +""" + +_DATASETNAME = "head_qa" + +_DESCRIPTION = """\ +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio +de Sanidad, Consumo y Bienestar Social. +The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, +pharmacology and biology. 
+""" + +_HOMEPAGE = "https://aghie.github.io/head-qa/" + +_LICENSE = "MIT License" + +_URLS = { + _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", +} + +_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +class HeadQADataset(datasets.GeneratorBasedBuilder): + """HEAD-QA: A Healthcare Dataset for Complex Reasoning""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="head_qa_source", + version=SOURCE_VERSION, + description="HeadQA both languages source schema", + schema="source", + subset_id="head_qa", + ), + BigBioConfig( + name="head_qa_en_source", + version=SOURCE_VERSION, + description="HeadQA English source schema", + schema="source", + subset_id="head_qa_en", + ), + BigBioConfig( + name="head_qa_es_source", + version=SOURCE_VERSION, + description="HeadQA Spanish source schema", + schema="source", + subset_id="head_qa_es", + ), + BigBioConfig( + name="head_qa_bigbio_t2t", + version=BIGBIO_VERSION, + description="HeadQA Translation BigBio schema", + schema="bigbio_t2t", + subset_id="head_qa", + ), + BigBioConfig( + name="head_qa_en_bigbio_qa", + version=BIGBIO_VERSION, + description="HeadQA English Question Answering BigBio schema", + schema="bigbio_qa", + subset_id="head_qa_en", + ), + BigBioConfig( + name="head_qa_es_bigbio_qa", + version=BIGBIO_VERSION, + description="HeadQA Spanish Question Answering BigBio schema", + schema="bigbio_qa", + subset_id="head_qa_es", + ), + ] + + DEFAULT_CONFIG_NAME = "head_qa_en_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source" and self.config.subset_id == "head_qa": + features = datasets.Features( + { + "name": datasets.Value("string"), + "year": datasets.Value("string"), + "category": datasets.Value("string"), + "qid": datasets.Value("int32"), + "qtext": { + "en": datasets.Value("string"), + "es": datasets.Value("string"), + }, + "ra": datasets.Value("int32"), + "image": datasets.Image(), + "answers": [ + { + "aid": datasets.Value("int32"), + "atext": { + "en": datasets.Value("string"), + "es": datasets.Value("string"), + }, + } + ], + } + ) + elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + features = datasets.Features( + { + "name": datasets.Value("string"), + "year": datasets.Value("string"), + "category": datasets.Value("string"), + "qid": datasets.Value("int32"), + "qtext": datasets.Value("string"), + "ra": datasets.Value("int32"), + "image": datasets.Image(), + "answers": [ + { + "aid": datasets.Value("int32"), + "atext": datasets.Value("string"), + } + ], + } + ) + elif self.config.schema == "bigbio_t2t": + features = schemas.text2text_features + elif self.config.schema == "bigbio_qa": + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"), + }, + ), + 
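+            # Note: each split pairs the matching EN and ES exam files
+            # (train/test/dev) from the same extracted archive.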
datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"), + }, + ), + ] + + def _generate_examples(self, data_dir, en_path, es_path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source" and self.config.subset_id == "head_qa": + for key, example in self._merge_documents( + self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) + ): + yield key, example + + elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + if self.config.subset_id == "head_qa_en": + filepath = en_path + elif self.config.subset_id == "head_qa_es": + filepath = es_path + for key, example in self._generate_source_documents(data_dir, filepath): + yield key, example + + elif self.config.schema == "bigbio_t2t": + for key, example in self._merge_documents( + self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) + ): + for key_t2t, example_t2t in self._generate_source_to_t2t(example): + yield key_t2t, example_t2t + + elif self.config.schema == "bigbio_qa": + if self.config.subset_id == "head_qa_en": + filepath = en_path + elif self.config.subset_id == "head_qa_es": + filepath = es_path + for key, example in self._generate_source_documents(data_dir, filepath): + yield key, self._source_to_qa(example) + + def _generate_source_documents(self, data_dir, filepath): + + with open(filepath, encoding="utf-8") as f: + head_qa = json.load(f) + + for exam_id, exam in enumerate(head_qa["exams"]): + content = head_qa["exams"][exam] + name = content["name"].strip() + year = content["year"].strip() + category = content["category"].strip() + for question in content["data"]: + qid = int(question["qid"].strip()) + qtext = question["qtext"].strip() + ra = int(question["ra"].strip()) + image_path = question["image"].strip() + + aids = [answer["aid"] for answer in question["answers"]] + atexts = [answer["atext"].strip() for answer in question["answers"]] + answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] + + id_ = f"{exam_id}_{qid}" + yield id_, { + "name": name, + "year": year, + "category": category, + "qid": qid, + "qtext": qtext, + "ra": ra, + "image": os.path.join(data_dir, image_path) if image_path else None, + "answers": answers, + } + + def _merge_documents(self, gen_en, gen_es): + for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es): + assert doc_en_id == doc_es_id, "ohno" + self._assert_eq_doc(doc_en, doc_es) + + doc_merge = doc_en.copy() + doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]} + answers = [] + for answer_en, answer_es in zip(doc_en["answers"], doc_es["answers"]): + assert answer_en["aid"] == answer_es["aid"], "ohno" + answers.append( + { + "aid": answer_en["aid"], + "atext": { + "en": answer_en["atext"], + "es": answer_es["atext"], + }, + } + ) + doc_merge["answers"] = answers + yield doc_en_id, doc_merge + + def _assert_eq_doc(self, doc1, doc2): + doc1 = doc1.copy() + doc2 = doc2.copy() + doc1.pop("qtext") + doc1.pop("answers") + doc2.pop("qtext") + doc2.pop("answers") + 
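+        # With qtext and answers popped, the remaining metadata (name, year,
+        # category, qid, ra, image) must be identical across the EN/ES versions.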
assert doc1 == doc2, f"ohno {doc1} {doc2}" + + def _source_to_qa(self, example): + example_ = {} + example_["id"] = example["name"] + "_qid_" + str(example["qid"]) + example_["question_id"] = example["qid"] + example_["document_id"] = "" + example_["question"] = example["qtext"] + example_["type"] = example["category"] + example_["choices"] = [answer["atext"] for answer in example["answers"]] + example_["context"] = "" + example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] + + return example_ + + def _generate_source_to_t2t(self, example): + id = example["name"] + "_qid_" + str(example["qid"]) + example_ = { + "id": id, + "document_id": "", + "text_1": example["qtext"]["en"], + "text_2": example["qtext"]["es"], + "text_1_name": "en", + "text_2_name": "es", + } + yield id, example_ + + for answer in example["answers"]: + id = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"]) + example_ = { + "id": id, + "document_id": "", + "text_1": answer["atext"]["en"], + "text_2": answer["atext"]["es"], + "text_1_name": "en", + "text_2_name": "es", + } + yield id, example_ From d5d1ca50838c6e6bd810e6c843adaedd33cb5915 Mon Sep 17 00:00:00 2001 From: Simon Ott Date: Wed, 27 Apr 2022 09:38:13 +0200 Subject: [PATCH 2/4] Removed t2t schema --- biodatasets/head_qa/head_qa.py | 146 ++++----------------------------- 1 file changed, 15 insertions(+), 131 deletions(-) diff --git a/biodatasets/head_qa/head_qa.py b/biodatasets/head_qa/head_qa.py index 17d8183f..52abaecd 100644 --- a/biodatasets/head_qa/head_qa.py +++ b/biodatasets/head_qa/head_qa.py @@ -66,7 +66,7 @@ _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", } -_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING] +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _SOURCE_VERSION = "1.0.0" @@ -80,13 +80,6 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) BUILDER_CONFIGS = [ - BigBioConfig( - name="head_qa_source", - version=SOURCE_VERSION, - description="HeadQA both languages source schema", - schema="source", - subset_id="head_qa", - ), BigBioConfig( name="head_qa_en_source", version=SOURCE_VERSION, @@ -101,13 +94,6 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): schema="source", subset_id="head_qa_es", ), - BigBioConfig( - name="head_qa_bigbio_t2t", - version=BIGBIO_VERSION, - description="HeadQA Translation BigBio schema", - schema="bigbio_t2t", - subset_id="head_qa", - ), BigBioConfig( name="head_qa_en_bigbio_qa", version=BIGBIO_VERSION, @@ -127,32 +113,7 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = "head_qa_en_source" def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source" and self.config.subset_id == "head_qa": - features = datasets.Features( - { - "name": datasets.Value("string"), - "year": datasets.Value("string"), - "category": datasets.Value("string"), - "qid": datasets.Value("int32"), - "qtext": { - "en": datasets.Value("string"), - "es": datasets.Value("string"), - }, - "ra": datasets.Value("int32"), - "image": datasets.Image(), - "answers": [ - { - "aid": datasets.Value("int32"), - "atext": { - "en": datasets.Value("string"), - "es": datasets.Value("string"), - }, - } - ], - } - ) - elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + if self.config.schema == "source": features = datasets.Features( { "name": 
datasets.Value("string"), @@ -170,8 +131,6 @@ def _info(self) -> datasets.DatasetInfo: ], } ) - elif self.config.schema == "bigbio_t2t": - features = schemas.text2text_features elif self.config.schema == "bigbio_qa": features = schemas.qa_features @@ -189,63 +148,43 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) + if self.config.subset_id == "head_qa_en": + file_path = os.path.join("HEAD_EN", "train_HEAD_EN.json") + elif self.config.subset_id == "head_qa_es": + file_path = os.path.join("HEAD", "train_HEAD.json") + return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), ] - def _generate_examples(self, data_dir, en_path, es_path) -> Tuple[int, Dict]: + def _generate_examples(self, data_dir, file_path) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - if self.config.schema == "source" and self.config.subset_id == "head_qa": - for key, example in self._merge_documents( - self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) - ): - yield key, example - - elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: - if self.config.subset_id == "head_qa_en": - filepath = en_path - elif self.config.subset_id == "head_qa_es": - filepath = es_path - for key, example in self._generate_source_documents(data_dir, filepath): + if self.config.schema == "source": + for key, example in self._generate_source_documents(data_dir, file_path): yield key, example - - elif self.config.schema == "bigbio_t2t": - for key, example in self._merge_documents( - self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) - ): - for key_t2t, example_t2t in self._generate_source_to_t2t(example): - yield key_t2t, example_t2t - elif self.config.schema == "bigbio_qa": - if self.config.subset_id == "head_qa_en": - filepath = en_path - elif self.config.subset_id == "head_qa_es": - filepath = es_path - for key, example in self._generate_source_documents(data_dir, filepath): + for key, example in self._generate_source_documents(data_dir, file_path): yield key, self._source_to_qa(example) def _generate_source_documents(self, data_dir, filepath): @@ -280,70 +219,15 @@ def _generate_source_documents(self, data_dir, filepath): "answers": answers, } - def _merge_documents(self, gen_en, gen_es): - for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es): - assert doc_en_id == doc_es_id, "ohno" - self._assert_eq_doc(doc_en, doc_es) - - doc_merge = doc_en.copy() - doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]} - answers = [] - for answer_en, answer_es in zip(doc_en["answers"], 
doc_es["answers"]): - assert answer_en["aid"] == answer_es["aid"], "ohno" - answers.append( - { - "aid": answer_en["aid"], - "atext": { - "en": answer_en["atext"], - "es": answer_es["atext"], - }, - } - ) - doc_merge["answers"] = answers - yield doc_en_id, doc_merge - - def _assert_eq_doc(self, doc1, doc2): - doc1 = doc1.copy() - doc2 = doc2.copy() - doc1.pop("qtext") - doc1.pop("answers") - doc2.pop("qtext") - doc2.pop("answers") - assert doc1 == doc2, f"ohno {doc1} {doc2}" - def _source_to_qa(self, example): example_ = {} example_["id"] = example["name"] + "_qid_" + str(example["qid"]) example_["question_id"] = example["qid"] example_["document_id"] = "" example_["question"] = example["qtext"] - example_["type"] = example["category"] + example_["type"] = "multiple_choice" example_["choices"] = [answer["atext"] for answer in example["answers"]] example_["context"] = "" example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] return example_ - - def _generate_source_to_t2t(self, example): - id = example["name"] + "_qid_" + str(example["qid"]) - example_ = { - "id": id, - "document_id": "", - "text_1": example["qtext"]["en"], - "text_2": example["qtext"]["es"], - "text_1_name": "en", - "text_2_name": "es", - } - yield id, example_ - - for answer in example["answers"]: - id = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"]) - example_ = { - "id": id, - "document_id": "", - "text_1": answer["atext"]["en"], - "text_2": answer["atext"]["es"], - "text_1_name": "en", - "text_2_name": "es", - } - yield id, example_ From 251a0695450cf194f363ce55af0c48307ebb0cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Thu, 24 Oct 2024 18:01:45 +0200 Subject: [PATCH 3/4] refactor: Refactor HEAD-QA dataset implementation to hub-based schema --- bigbio/hub/hub_repos/head_qa/README.md | 50 ++ bigbio/hub/hub_repos/head_qa/bigbiohub.py | 590 ++++++++++++++++++ .../hub/hub_repos}/head_qa/head_qa.py | 127 ++-- 3 files changed, 702 insertions(+), 65 deletions(-) create mode 100644 bigbio/hub/hub_repos/head_qa/README.md create mode 100644 bigbio/hub/hub_repos/head_qa/bigbiohub.py rename {biodatasets => bigbio/hub/hub_repos}/head_qa/head_qa.py (66%) diff --git a/bigbio/hub/hub_repos/head_qa/README.md b/bigbio/hub/hub_repos/head_qa/README.md new file mode 100644 index 00000000..a1f255b3 --- /dev/null +++ b/bigbio/hub/hub_repos/head_qa/README.md @@ -0,0 +1,50 @@ +--- +language: + - es + - en +bigbio_language: + - English + - Spanish +license: mit +bigbio_license_shortname: MIT +multilinguality: monolingual +pretty_name: HEAD-QA +homepage: https://aghie.github.io/head-qa/ +bigbio_pubmed: false +bigbio_public: true +bigbio_tasks: + - QUESTION_ANSWERING +--- + + +# Dataset Card for HEAD-QA + +## Dataset Description + +- **Homepage:** https://aghie.github.io/head-qa/ +- **Pubmed:** False +- **Public:** True +- **Tasks:** QA + +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the +Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, +nursing, psychology, chemistry, pharmacology and biology. 
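
A minimal loading sketch (this assumes the loader is published on the Hub as `bigbio/head_qa`; the config names come from the builder in this PR, and newer `datasets` releases may additionally require `trust_remote_code=True` for script-based datasets):

```python
from datasets import load_dataset

# English subset, harmonized BigBio QA schema
ds = load_dataset("bigbio/head_qa", name="head_qa_en_bigbio_qa")

example = ds["train"][0]
print(example["question"])  # question text
print(example["choices"])   # all answer options
print(example["answer"])    # list holding the single gold answer
```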
+ + +## Citation Information + +``` +@inproceedings{vilares-gomez-rodriguez-2019-head, + title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", + author = "Vilares, David and G{\'o}mez-Rodr{\'i}guez, Carlos", + booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/P19-1092", + doi = "10.18653/v1/P19-1092", + pages = "960--966" +} +``` diff --git a/bigbio/hub/hub_repos/head_qa/bigbiohub.py b/bigbio/hub/hub_repos/head_qa/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/head_qa/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + 
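+                # mention text is a sequence: discontiguous entities are split
+                # into aligned text/offset chunks (cf. parse_brat_file below)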
"text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/biodatasets/head_qa/head_qa.py b/bigbio/hub/hub_repos/head_qa/head_qa.py similarity index 66% rename from biodatasets/head_qa/head_qa.py rename to bigbio/hub/hub_repos/head_qa/head_qa.py index 52abaecd..ee8d3283 100644 --- a/biodatasets/head_qa/head_qa.py +++ b/bigbio/hub/hub_repos/head_qa/head_qa.py @@ -13,31 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" -HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the -Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio -de Sanidad, Consumo y Bienestar Social. -The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, -pharmacology and biology. - -Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py -""" - import json -import os +from pathlib import Path from typing import Dict, List, Tuple import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks +from .bigbiohub import BigBioConfig, Tasks, qa_features + +_LANGUAGES = ["English", "Spanish"] +_LICENSE = "MIT" +_LOCAL = False +_PUBMED = False _CITATION = """\ @inproceedings{vilares-gomez-rodriguez-2019-head, title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", - author = "Vilares, David and - G{\'o}mez-Rodr{\'i}guez, Carlos", + author = "Vilares, David and G{\'o}mez-Rodr{\'i}guez, Carlos", booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2019", @@ -45,31 +37,30 @@ publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P19-1092", doi = "10.18653/v1/P19-1092", - pages = "960--966"} + pages = "960--966" +} """ _DATASETNAME = "head_qa" +_DISPLAYNAME = "HEAD-QA" _DESCRIPTION = """\ -HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the -Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio -de Sanidad, Consumo y Bienestar Social. -The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, -pharmacology and biology. +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the \ +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the \ +Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, \ +nursing, psychology, chemistry, pharmacology and biology. 
""" _HOMEPAGE = "https://aghie.github.io/head-qa/" -_LICENSE = "MIT License" - _URLS = { - _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", + "HEAD": "https://drive.usercontent.google.com/u/0/uc?id=1dUIqVwvoZAtbX_-z5axCoe97XNcFo1No&export=download", + "HEAD_EN": "https://drive.usercontent.google.com/u/0/uc?id=1phryJg4FjCFkn0mSCqIOP2-FscAeKGV0&export=download", } _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _SOURCE_VERSION = "1.0.0" - _BIGBIO_VERSION = "1.0.0" @@ -122,7 +113,6 @@ def _info(self) -> datasets.DatasetInfo: "qid": datasets.Value("int32"), "qtext": datasets.Value("string"), "ra": datasets.Value("int32"), - "image": datasets.Image(), "answers": [ { "aid": datasets.Value("int32"), @@ -132,7 +122,9 @@ def _info(self) -> datasets.DatasetInfo: } ) elif self.config.schema == "bigbio_qa": - features = schemas.qa_features + features = qa_features + else: + raise NotImplementedError(f"Schema {self.config.schema} is not supported") return datasets.DatasetInfo( description=_DESCRIPTION, @@ -144,90 +136,95 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - - urls = _URLS[_DATASETNAME] - data_dir = dl_manager.download_and_extract(urls) - if self.config.subset_id == "head_qa_en": - file_path = os.path.join("HEAD_EN", "train_HEAD_EN.json") + data_dir = Path(dl_manager.download_and_extract(_URLS["HEAD_EN"])) / "HEAD_EN" + subset_name = "HEAD_EN" + elif self.config.subset_id == "head_qa_es": - file_path = os.path.join("HEAD", "train_HEAD.json") + data_dir = Path(dl_manager.download_and_extract(_URLS["HEAD"])) / "HEAD" + subset_name = "HEAD" + + else: + raise NotImplementedError(f"Subset {self.config.subset_id} is not supported") return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"train_{subset_name}.json", }, ), datasets.SplitGenerator( - name=datasets.Split.TEST, + name=datasets.Split.VALIDATION, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"dev_{subset_name}.json", }, ), datasets.SplitGenerator( - name=datasets.Split.VALIDATION, + name=datasets.Split.TEST, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"test_{subset_name}.json", }, ), ] - def _generate_examples(self, data_dir, file_path) -> Tuple[int, Dict]: + def _generate_examples(self, input_json_file: Path) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" if self.config.schema == "source": - for key, example in self._generate_source_documents(data_dir, file_path): + for key, example in self._generate_source_documents(input_json_file): yield key, example + elif self.config.schema == "bigbio_qa": - for key, example in self._generate_source_documents(data_dir, file_path): - yield key, self._source_to_qa(example) + for key, example in self._generate_source_documents(input_json_file): + yield self._source_to_qa(example) - def _generate_source_documents(self, data_dir, filepath): + def _generate_source_documents(self, input_json_file: Path) -> Tuple[str, Dict]: + """Generates source instances.""" - with open(filepath, encoding="utf-8") as f: - head_qa = json.load(f) + with input_json_file.open("r", encoding="utf8") as file_stream: + head_qa = json.load(file_stream) for exam_id, exam in 
enumerate(head_qa["exams"]): content = head_qa["exams"][exam] name = content["name"].strip() year = content["year"].strip() category = content["category"].strip() + for question in content["data"]: qid = int(question["qid"].strip()) qtext = question["qtext"].strip() ra = int(question["ra"].strip()) - image_path = question["image"].strip() aids = [answer["aid"] for answer in question["answers"]] atexts = [answer["atext"].strip() for answer in question["answers"]] answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] - id_ = f"{exam_id}_{qid}" - yield id_, { + instance_id = f"{exam_id}_{qid}" + instance = { "name": name, "year": year, "category": category, "qid": qid, "qtext": qtext, "ra": ra, - "image": os.path.join(data_dir, image_path) if image_path else None, "answers": answers, } - def _source_to_qa(self, example): - example_ = {} - example_["id"] = example["name"] + "_qid_" + str(example["qid"]) - example_["question_id"] = example["qid"] - example_["document_id"] = "" - example_["question"] = example["qtext"] - example_["type"] = "multiple_choice" - example_["choices"] = [answer["atext"] for answer in example["answers"]] - example_["context"] = "" - example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] - - return example_ + yield instance_id, instance + + def _source_to_qa(self, example: Dict) -> Tuple[str, Dict]: + """Converts a source example to BigBio example.""" + + instance = { + "id": example["name"] + "_qid_" + str(example["qid"]), + "question_id": example["qid"], + "document_id": None, + "question": example["qtext"], + "type": "multiple_choice", + "choices": [answer["atext"] for answer in example["answers"]], + "context": None, + "answer": [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]], + } + + return instance["id"], instance From 35093d31d80b4abd3aa2bc8701cb1d1e983b2312 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 25 Oct 2024 10:07:01 +0200 Subject: [PATCH 4/4] Minor adjustments to Readme --- bigbio/hub/hub_repos/head_qa/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigbio/hub/hub_repos/head_qa/README.md b/bigbio/hub/hub_repos/head_qa/README.md index a1f255b3..bc098a89 100644 --- a/bigbio/hub/hub_repos/head_qa/README.md +++ b/bigbio/hub/hub_repos/head_qa/README.md @@ -7,7 +7,7 @@ bigbio_language: - Spanish license: mit bigbio_license_shortname: MIT -multilinguality: monolingual +multilinguality: multilingual pretty_name: HEAD-QA homepage: https://aghie.github.io/head-qa/ bigbio_pubmed: false @@ -28,7 +28,7 @@ bigbio_tasks: HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the -Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, +Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
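
As a final sanity check of the harmonized output, the converter at the heart of this series can be exercised on a hand-made record. The sketch below re-implements `_source_to_qa` standalone; the exam name, question, and answers are invented for illustration and are not taken from the real archive:

```python
# Standalone sketch of the source -> bigbio_qa conversion in this PR,
# run on a toy record (hypothetical data, for illustration only).

def source_to_qa(example):
    # Mirrors HeadQADataset._source_to_qa as of the last patch in this series.
    gold = next(a for a in example["answers"] if a["aid"] == example["ra"])
    instance = {
        "id": f'{example["name"]}_qid_{example["qid"]}',
        "question_id": example["qid"],
        "document_id": None,
        "question": example["qtext"],
        "type": "multiple_choice",
        "choices": [a["atext"] for a in example["answers"]],
        "context": None,
        "answer": [gold["atext"]],
    }
    return instance["id"], instance


toy = {
    "name": "Cuaderno_2013_1_B",  # hypothetical exam name
    "qid": 1,
    "qtext": "Which organ produces insulin?",
    "ra": 2,  # aid of the correct answer
    "answers": [
        {"aid": 1, "atext": "Liver"},
        {"aid": 2, "atext": "Pancreas"},
        {"aid": 3, "atext": "Spleen"},
    ],
}

key, qa = source_to_qa(toy)
assert qa["answer"] == ["Pancreas"]
print(key)  # -> Cuaderno_2013_1_B_qid_1
```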