From de3f6640dfd089068ebe1cbe24d04315ae7f56da Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 22 Apr 2022 12:59:36 +0200 Subject: [PATCH 1/4] Initial head qa dataset --- biodatasets/head_qa/head_qa.py | 349 +++++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 biodatasets/head_qa/head_qa.py diff --git a/biodatasets/head_qa/head_qa.py b/biodatasets/head_qa/head_qa.py new file mode 100644 index 00000000..17d8183f --- /dev/null +++ b/biodatasets/head_qa/head_qa.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio +de Sanidad, Consumo y Bienestar Social. +The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, +pharmacology and biology. + +Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py +""" + +import json +import os +from typing import Dict, List, Tuple + +import datasets + +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@inproceedings{vilares-gomez-rodriguez-2019-head, + title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", + author = "Vilares, David and + G{\'o}mez-Rodr{\'i}guez, Carlos", + booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/P19-1092", + doi = "10.18653/v1/P19-1092", + pages = "960--966"} +""" + +_DATASETNAME = "head_qa" + +_DESCRIPTION = """\ +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio +de Sanidad, Consumo y Bienestar Social. +The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, +pharmacology and biology. 
+""" + +_HOMEPAGE = "https://aghie.github.io/head-qa/" + +_LICENSE = "MIT License" + +_URLS = { + _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", +} + +_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +class HeadQADataset(datasets.GeneratorBasedBuilder): + """HEAD-QA: A Healthcare Dataset for Complex Reasoning""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="head_qa_source", + version=SOURCE_VERSION, + description="HeadQA both languages source schema", + schema="source", + subset_id="head_qa", + ), + BigBioConfig( + name="head_qa_en_source", + version=SOURCE_VERSION, + description="HeadQA English source schema", + schema="source", + subset_id="head_qa_en", + ), + BigBioConfig( + name="head_qa_es_source", + version=SOURCE_VERSION, + description="HeadQA Spanish source schema", + schema="source", + subset_id="head_qa_es", + ), + BigBioConfig( + name="head_qa_bigbio_t2t", + version=BIGBIO_VERSION, + description="HeadQA Translation BigBio schema", + schema="bigbio_t2t", + subset_id="head_qa", + ), + BigBioConfig( + name="head_qa_en_bigbio_qa", + version=BIGBIO_VERSION, + description="HeadQA English Question Answering BigBio schema", + schema="bigbio_qa", + subset_id="head_qa_en", + ), + BigBioConfig( + name="head_qa_es_bigbio_qa", + version=BIGBIO_VERSION, + description="HeadQA Spanish Question Answering BigBio schema", + schema="bigbio_qa", + subset_id="head_qa_es", + ), + ] + + DEFAULT_CONFIG_NAME = "head_qa_en_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source" and self.config.subset_id == "head_qa": + features = datasets.Features( + { + "name": datasets.Value("string"), + "year": datasets.Value("string"), + "category": datasets.Value("string"), + "qid": datasets.Value("int32"), + "qtext": { + "en": datasets.Value("string"), + "es": datasets.Value("string"), + }, + "ra": datasets.Value("int32"), + "image": datasets.Image(), + "answers": [ + { + "aid": datasets.Value("int32"), + "atext": { + "en": datasets.Value("string"), + "es": datasets.Value("string"), + }, + } + ], + } + ) + elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + features = datasets.Features( + { + "name": datasets.Value("string"), + "year": datasets.Value("string"), + "category": datasets.Value("string"), + "qid": datasets.Value("int32"), + "qtext": datasets.Value("string"), + "ra": datasets.Value("int32"), + "image": datasets.Image(), + "answers": [ + { + "aid": datasets.Value("int32"), + "atext": datasets.Value("string"), + } + ], + } + ) + elif self.config.schema == "bigbio_t2t": + features = schemas.text2text_features + elif self.config.schema == "bigbio_qa": + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"), + }, + ), + 
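+            # Note: each split pairs the matching EN and ES exam files
+            # (train/test/dev) from the same extracted archive.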
datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_dir": data_dir, + "en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"), + "es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"), + }, + ), + ] + + def _generate_examples(self, data_dir, en_path, es_path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source" and self.config.subset_id == "head_qa": + for key, example in self._merge_documents( + self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) + ): + yield key, example + + elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + if self.config.subset_id == "head_qa_en": + filepath = en_path + elif self.config.subset_id == "head_qa_es": + filepath = es_path + for key, example in self._generate_source_documents(data_dir, filepath): + yield key, example + + elif self.config.schema == "bigbio_t2t": + for key, example in self._merge_documents( + self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) + ): + for key_t2t, example_t2t in self._generate_source_to_t2t(example): + yield key_t2t, example_t2t + + elif self.config.schema == "bigbio_qa": + if self.config.subset_id == "head_qa_en": + filepath = en_path + elif self.config.subset_id == "head_qa_es": + filepath = es_path + for key, example in self._generate_source_documents(data_dir, filepath): + yield key, self._source_to_qa(example) + + def _generate_source_documents(self, data_dir, filepath): + + with open(filepath, encoding="utf-8") as f: + head_qa = json.load(f) + + for exam_id, exam in enumerate(head_qa["exams"]): + content = head_qa["exams"][exam] + name = content["name"].strip() + year = content["year"].strip() + category = content["category"].strip() + for question in content["data"]: + qid = int(question["qid"].strip()) + qtext = question["qtext"].strip() + ra = int(question["ra"].strip()) + image_path = question["image"].strip() + + aids = [answer["aid"] for answer in question["answers"]] + atexts = [answer["atext"].strip() for answer in question["answers"]] + answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] + + id_ = f"{exam_id}_{qid}" + yield id_, { + "name": name, + "year": year, + "category": category, + "qid": qid, + "qtext": qtext, + "ra": ra, + "image": os.path.join(data_dir, image_path) if image_path else None, + "answers": answers, + } + + def _merge_documents(self, gen_en, gen_es): + for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es): + assert doc_en_id == doc_es_id, "ohno" + self._assert_eq_doc(doc_en, doc_es) + + doc_merge = doc_en.copy() + doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]} + answers = [] + for answer_en, answer_es in zip(doc_en["answers"], doc_es["answers"]): + assert answer_en["aid"] == answer_es["aid"], "ohno" + answers.append( + { + "aid": answer_en["aid"], + "atext": { + "en": answer_en["atext"], + "es": answer_es["atext"], + }, + } + ) + doc_merge["answers"] = answers + yield doc_en_id, doc_merge + + def _assert_eq_doc(self, doc1, doc2): + doc1 = doc1.copy() + doc2 = doc2.copy() + doc1.pop("qtext") + doc1.pop("answers") + doc2.pop("qtext") + doc2.pop("answers") + 
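+        # With qtext and answers popped, the remaining metadata (name, year,
+        # category, qid, ra, image) must be identical across the EN/ES versions.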
assert doc1 == doc2, f"ohno {doc1} {doc2}" + + def _source_to_qa(self, example): + example_ = {} + example_["id"] = example["name"] + "_qid_" + str(example["qid"]) + example_["question_id"] = example["qid"] + example_["document_id"] = "" + example_["question"] = example["qtext"] + example_["type"] = example["category"] + example_["choices"] = [answer["atext"] for answer in example["answers"]] + example_["context"] = "" + example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] + + return example_ + + def _generate_source_to_t2t(self, example): + id = example["name"] + "_qid_" + str(example["qid"]) + example_ = { + "id": id, + "document_id": "", + "text_1": example["qtext"]["en"], + "text_2": example["qtext"]["es"], + "text_1_name": "en", + "text_2_name": "es", + } + yield id, example_ + + for answer in example["answers"]: + id = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"]) + example_ = { + "id": id, + "document_id": "", + "text_1": answer["atext"]["en"], + "text_2": answer["atext"]["es"], + "text_1_name": "en", + "text_2_name": "es", + } + yield id, example_ From d5d1ca50838c6e6bd810e6c843adaedd33cb5915 Mon Sep 17 00:00:00 2001 From: Simon Ott Date: Wed, 27 Apr 2022 09:38:13 +0200 Subject: [PATCH 2/4] Removed t2t schema --- biodatasets/head_qa/head_qa.py | 146 ++++----------------------------- 1 file changed, 15 insertions(+), 131 deletions(-) diff --git a/biodatasets/head_qa/head_qa.py b/biodatasets/head_qa/head_qa.py index 17d8183f..52abaecd 100644 --- a/biodatasets/head_qa/head_qa.py +++ b/biodatasets/head_qa/head_qa.py @@ -66,7 +66,7 @@ _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", } -_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING] +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _SOURCE_VERSION = "1.0.0" @@ -80,13 +80,6 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) BUILDER_CONFIGS = [ - BigBioConfig( - name="head_qa_source", - version=SOURCE_VERSION, - description="HeadQA both languages source schema", - schema="source", - subset_id="head_qa", - ), BigBioConfig( name="head_qa_en_source", version=SOURCE_VERSION, @@ -101,13 +94,6 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): schema="source", subset_id="head_qa_es", ), - BigBioConfig( - name="head_qa_bigbio_t2t", - version=BIGBIO_VERSION, - description="HeadQA Translation BigBio schema", - schema="bigbio_t2t", - subset_id="head_qa", - ), BigBioConfig( name="head_qa_en_bigbio_qa", version=BIGBIO_VERSION, @@ -127,32 +113,7 @@ class HeadQADataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = "head_qa_en_source" def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source" and self.config.subset_id == "head_qa": - features = datasets.Features( - { - "name": datasets.Value("string"), - "year": datasets.Value("string"), - "category": datasets.Value("string"), - "qid": datasets.Value("int32"), - "qtext": { - "en": datasets.Value("string"), - "es": datasets.Value("string"), - }, - "ra": datasets.Value("int32"), - "image": datasets.Image(), - "answers": [ - { - "aid": datasets.Value("int32"), - "atext": { - "en": datasets.Value("string"), - "es": datasets.Value("string"), - }, - } - ], - } - ) - elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: + if self.config.schema == "source": features = datasets.Features( { "name": 
datasets.Value("string"), @@ -170,8 +131,6 @@ def _info(self) -> datasets.DatasetInfo: ], } ) - elif self.config.schema == "bigbio_t2t": - features = schemas.text2text_features elif self.config.schema == "bigbio_qa": features = schemas.qa_features @@ -189,63 +148,43 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) + if self.config.subset_id == "head_qa_en": + file_path = os.path.join("HEAD_EN", "train_HEAD_EN.json") + elif self.config.subset_id == "head_qa_es": + file_path = os.path.join("HEAD", "train_HEAD.json") + return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "data_dir": data_dir, - "en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"), - "es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"), + "file_path": os.path.join(data_dir, file_path), }, ), ] - def _generate_examples(self, data_dir, en_path, es_path) -> Tuple[int, Dict]: + def _generate_examples(self, data_dir, file_path) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - if self.config.schema == "source" and self.config.subset_id == "head_qa": - for key, example in self._merge_documents( - self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) - ): - yield key, example - - elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]: - if self.config.subset_id == "head_qa_en": - filepath = en_path - elif self.config.subset_id == "head_qa_es": - filepath = es_path - for key, example in self._generate_source_documents(data_dir, filepath): + if self.config.schema == "source": + for key, example in self._generate_source_documents(data_dir, file_path): yield key, example - - elif self.config.schema == "bigbio_t2t": - for key, example in self._merge_documents( - self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path) - ): - for key_t2t, example_t2t in self._generate_source_to_t2t(example): - yield key_t2t, example_t2t - elif self.config.schema == "bigbio_qa": - if self.config.subset_id == "head_qa_en": - filepath = en_path - elif self.config.subset_id == "head_qa_es": - filepath = es_path - for key, example in self._generate_source_documents(data_dir, filepath): + for key, example in self._generate_source_documents(data_dir, file_path): yield key, self._source_to_qa(example) def _generate_source_documents(self, data_dir, filepath): @@ -280,70 +219,15 @@ def _generate_source_documents(self, data_dir, filepath): "answers": answers, } - def _merge_documents(self, gen_en, gen_es): - for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es): - assert doc_en_id == doc_es_id, "ohno" - self._assert_eq_doc(doc_en, doc_es) - - doc_merge = doc_en.copy() - doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]} - answers = [] - for answer_en, answer_es in zip(doc_en["answers"], 
doc_es["answers"]): - assert answer_en["aid"] == answer_es["aid"], "ohno" - answers.append( - { - "aid": answer_en["aid"], - "atext": { - "en": answer_en["atext"], - "es": answer_es["atext"], - }, - } - ) - doc_merge["answers"] = answers - yield doc_en_id, doc_merge - - def _assert_eq_doc(self, doc1, doc2): - doc1 = doc1.copy() - doc2 = doc2.copy() - doc1.pop("qtext") - doc1.pop("answers") - doc2.pop("qtext") - doc2.pop("answers") - assert doc1 == doc2, f"ohno {doc1} {doc2}" - def _source_to_qa(self, example): example_ = {} example_["id"] = example["name"] + "_qid_" + str(example["qid"]) example_["question_id"] = example["qid"] example_["document_id"] = "" example_["question"] = example["qtext"] - example_["type"] = example["category"] + example_["type"] = "multiple_choice" example_["choices"] = [answer["atext"] for answer in example["answers"]] example_["context"] = "" example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] return example_ - - def _generate_source_to_t2t(self, example): - id = example["name"] + "_qid_" + str(example["qid"]) - example_ = { - "id": id, - "document_id": "", - "text_1": example["qtext"]["en"], - "text_2": example["qtext"]["es"], - "text_1_name": "en", - "text_2_name": "es", - } - yield id, example_ - - for answer in example["answers"]: - id = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"]) - example_ = { - "id": id, - "document_id": "", - "text_1": answer["atext"]["en"], - "text_2": answer["atext"]["es"], - "text_1_name": "en", - "text_2_name": "es", - } - yield id, example_ From 251a0695450cf194f363ce55af0c48307ebb0cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Thu, 24 Oct 2024 18:01:45 +0200 Subject: [PATCH 3/4] refactor: Refactor HEAD-QA dataset implementation to hub-based schema --- bigbio/hub/hub_repos/head_qa/README.md | 50 ++ bigbio/hub/hub_repos/head_qa/bigbiohub.py | 590 ++++++++++++++++++ .../hub/hub_repos}/head_qa/head_qa.py | 127 ++-- 3 files changed, 702 insertions(+), 65 deletions(-) create mode 100644 bigbio/hub/hub_repos/head_qa/README.md create mode 100644 bigbio/hub/hub_repos/head_qa/bigbiohub.py rename {biodatasets => bigbio/hub/hub_repos}/head_qa/head_qa.py (66%) diff --git a/bigbio/hub/hub_repos/head_qa/README.md b/bigbio/hub/hub_repos/head_qa/README.md new file mode 100644 index 00000000..a1f255b3 --- /dev/null +++ b/bigbio/hub/hub_repos/head_qa/README.md @@ -0,0 +1,50 @@ +--- +language: + - es + - en +bigbio_language: + - English + - Spanish +license: mit +bigbio_license_shortname: MIT +multilinguality: monolingual +pretty_name: HEAD-QA +homepage: https://aghie.github.io/head-qa/ +bigbio_pubmed: false +bigbio_public: true +bigbio_tasks: + - QUESTION_ANSWERING +--- + + +# Dataset Card for HEAD-QA + +## Dataset Description + +- **Homepage:** https://aghie.github.io/head-qa/ +- **Pubmed:** False +- **Public:** True +- **Tasks:** QA + +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the +Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, +nursing, psychology, chemistry, pharmacology and biology. 
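
A minimal loading sketch (this assumes the loader is published on the Hub as `bigbio/head_qa`; the config names come from the builder in this PR, and newer `datasets` releases may additionally require `trust_remote_code=True` for script-based datasets):

```python
from datasets import load_dataset

# English subset, harmonized BigBio QA schema
ds = load_dataset("bigbio/head_qa", name="head_qa_en_bigbio_qa")

example = ds["train"][0]
print(example["question"])  # question text
print(example["choices"])   # all answer options
print(example["answer"])    # list holding the single gold answer
```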
+ + +## Citation Information + +``` +@inproceedings{vilares-gomez-rodriguez-2019-head, + title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", + author = "Vilares, David and G{\'o}mez-Rodr{\'i}guez, Carlos", + booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/P19-1092", + doi = "10.18653/v1/P19-1092", + pages = "960--966" +} +``` diff --git a/bigbio/hub/hub_repos/head_qa/bigbiohub.py b/bigbio/hub/hub_repos/head_qa/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/head_qa/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + 
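+                # mention text is a sequence: discontiguous entities are split
+                # into aligned text/offset chunks (cf. parse_brat_file below)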
"text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/biodatasets/head_qa/head_qa.py b/bigbio/hub/hub_repos/head_qa/head_qa.py similarity index 66% rename from biodatasets/head_qa/head_qa.py rename to bigbio/hub/hub_repos/head_qa/head_qa.py index 52abaecd..ee8d3283 100644 --- a/biodatasets/head_qa/head_qa.py +++ b/bigbio/hub/hub_repos/head_qa/head_qa.py @@ -13,31 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" -HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the -Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio -de Sanidad, Consumo y Bienestar Social. -The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, -pharmacology and biology. - -Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py -""" - import json -import os +from pathlib import Path from typing import Dict, List, Tuple import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks +from .bigbiohub import BigBioConfig, Tasks, qa_features + +_LANGUAGES = ["English", "Spanish"] +_LICENSE = "MIT" +_LOCAL = False +_PUBMED = False _CITATION = """\ @inproceedings{vilares-gomez-rodriguez-2019-head, title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", - author = "Vilares, David and - G{\'o}mez-Rodr{\'i}guez, Carlos", + author = "Vilares, David and G{\'o}mez-Rodr{\'i}guez, Carlos", booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2019", @@ -45,31 +37,30 @@ publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P19-1092", doi = "10.18653/v1/P19-1092", - pages = "960--966"} + pages = "960--966" +} """ _DATASETNAME = "head_qa" +_DISPLAYNAME = "HEAD-QA" _DESCRIPTION = """\ -HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the -Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio -de Sanidad, Consumo y Bienestar Social. -The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, -pharmacology and biology. +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the \ +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the \ +Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, \ +nursing, psychology, chemistry, pharmacology and biology. 
""" _HOMEPAGE = "https://aghie.github.io/head-qa/" -_LICENSE = "MIT License" - _URLS = { - _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t", + "HEAD": "https://drive.usercontent.google.com/u/0/uc?id=1dUIqVwvoZAtbX_-z5axCoe97XNcFo1No&export=download", + "HEAD_EN": "https://drive.usercontent.google.com/u/0/uc?id=1phryJg4FjCFkn0mSCqIOP2-FscAeKGV0&export=download", } _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _SOURCE_VERSION = "1.0.0" - _BIGBIO_VERSION = "1.0.0" @@ -122,7 +113,6 @@ def _info(self) -> datasets.DatasetInfo: "qid": datasets.Value("int32"), "qtext": datasets.Value("string"), "ra": datasets.Value("int32"), - "image": datasets.Image(), "answers": [ { "aid": datasets.Value("int32"), @@ -132,7 +122,9 @@ def _info(self) -> datasets.DatasetInfo: } ) elif self.config.schema == "bigbio_qa": - features = schemas.qa_features + features = qa_features + else: + raise NotImplementedError(f"Schema {self.config.schema} is not supported") return datasets.DatasetInfo( description=_DESCRIPTION, @@ -144,90 +136,95 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - - urls = _URLS[_DATASETNAME] - data_dir = dl_manager.download_and_extract(urls) - if self.config.subset_id == "head_qa_en": - file_path = os.path.join("HEAD_EN", "train_HEAD_EN.json") + data_dir = Path(dl_manager.download_and_extract(_URLS["HEAD_EN"])) / "HEAD_EN" + subset_name = "HEAD_EN" + elif self.config.subset_id == "head_qa_es": - file_path = os.path.join("HEAD", "train_HEAD.json") + data_dir = Path(dl_manager.download_and_extract(_URLS["HEAD"])) / "HEAD" + subset_name = "HEAD" + + else: + raise NotImplementedError(f"Subset {self.config.subset_id} is not supported") return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"train_{subset_name}.json", }, ), datasets.SplitGenerator( - name=datasets.Split.TEST, + name=datasets.Split.VALIDATION, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"dev_{subset_name}.json", }, ), datasets.SplitGenerator( - name=datasets.Split.VALIDATION, + name=datasets.Split.TEST, gen_kwargs={ - "data_dir": data_dir, - "file_path": os.path.join(data_dir, file_path), + "input_json_file": data_dir / f"test_{subset_name}.json", }, ), ] - def _generate_examples(self, data_dir, file_path) -> Tuple[int, Dict]: + def _generate_examples(self, input_json_file: Path) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" if self.config.schema == "source": - for key, example in self._generate_source_documents(data_dir, file_path): + for key, example in self._generate_source_documents(input_json_file): yield key, example + elif self.config.schema == "bigbio_qa": - for key, example in self._generate_source_documents(data_dir, file_path): - yield key, self._source_to_qa(example) + for key, example in self._generate_source_documents(input_json_file): + yield self._source_to_qa(example) - def _generate_source_documents(self, data_dir, filepath): + def _generate_source_documents(self, input_json_file: Path) -> Tuple[str, Dict]: + """Generates source instances.""" - with open(filepath, encoding="utf-8") as f: - head_qa = json.load(f) + with input_json_file.open("r", encoding="utf8") as file_stream: + head_qa = json.load(file_stream) for exam_id, exam in 
enumerate(head_qa["exams"]): content = head_qa["exams"][exam] name = content["name"].strip() year = content["year"].strip() category = content["category"].strip() + for question in content["data"]: qid = int(question["qid"].strip()) qtext = question["qtext"].strip() ra = int(question["ra"].strip()) - image_path = question["image"].strip() aids = [answer["aid"] for answer in question["answers"]] atexts = [answer["atext"].strip() for answer in question["answers"]] answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] - id_ = f"{exam_id}_{qid}" - yield id_, { + instance_id = f"{exam_id}_{qid}" + instance = { "name": name, "year": year, "category": category, "qid": qid, "qtext": qtext, "ra": ra, - "image": os.path.join(data_dir, image_path) if image_path else None, "answers": answers, } - def _source_to_qa(self, example): - example_ = {} - example_["id"] = example["name"] + "_qid_" + str(example["qid"]) - example_["question_id"] = example["qid"] - example_["document_id"] = "" - example_["question"] = example["qtext"] - example_["type"] = "multiple_choice" - example_["choices"] = [answer["atext"] for answer in example["answers"]] - example_["context"] = "" - example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]] - - return example_ + yield instance_id, instance + + def _source_to_qa(self, example: Dict) -> Tuple[str, Dict]: + """Converts a source example to BigBio example.""" + + instance = { + "id": example["name"] + "_qid_" + str(example["qid"]), + "question_id": example["qid"], + "document_id": None, + "question": example["qtext"], + "type": "multiple_choice", + "choices": [answer["atext"] for answer in example["answers"]], + "context": None, + "answer": [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]], + } + + return instance["id"], instance From 35093d31d80b4abd3aa2bc8701cb1d1e983b2312 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 25 Oct 2024 10:07:01 +0200 Subject: [PATCH 4/4] Minor adjustments to Readme --- bigbio/hub/hub_repos/head_qa/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigbio/hub/hub_repos/head_qa/README.md b/bigbio/hub/hub_repos/head_qa/README.md index a1f255b3..bc098a89 100644 --- a/bigbio/hub/hub_repos/head_qa/README.md +++ b/bigbio/hub/hub_repos/head_qa/README.md @@ -7,7 +7,7 @@ bigbio_language: - Spanish license: mit bigbio_license_shortname: MIT -multilinguality: monolingual +multilinguality: multilingual pretty_name: HEAD-QA homepage: https://aghie.github.io/head-qa/ bigbio_pubmed: false @@ -28,7 +28,7 @@ bigbio_tasks: HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the -Ministerio de Sanidad, Consumo y Bienestar Social.The dataset contains questions about following topics: medicine, +Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
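
As a final sanity check of the harmonized output, the converter at the heart of this series can be exercised on a hand-made record. The sketch below re-implements `_source_to_qa` standalone; the exam name, question, and answers are invented for illustration and are not taken from the real archive:

```python
# Standalone sketch of the source -> bigbio_qa conversion in this PR,
# run on a toy record (hypothetical data, for illustration only).

def source_to_qa(example):
    # Mirrors HeadQADataset._source_to_qa as of the last patch in this series.
    gold = next(a for a in example["answers"] if a["aid"] == example["ra"])
    instance = {
        "id": f'{example["name"]}_qid_{example["qid"]}',
        "question_id": example["qid"],
        "document_id": None,
        "question": example["qtext"],
        "type": "multiple_choice",
        "choices": [a["atext"] for a in example["answers"]],
        "context": None,
        "answer": [gold["atext"]],
    }
    return instance["id"], instance


toy = {
    "name": "Cuaderno_2013_1_B",  # hypothetical exam name
    "qid": 1,
    "qtext": "Which organ produces insulin?",
    "ra": 2,  # aid of the correct answer
    "answers": [
        {"aid": 1, "atext": "Liver"},
        {"aid": 2, "atext": "Pancreas"},
        {"aid": 3, "atext": "Spleen"},
    ],
}

key, qa = source_to_qa(toy)
assert qa["answer"] == ["Pancreas"]
print(key)  # -> Cuaderno_2013_1_B_qid_1
```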