From 66090e3fa59c3a03fd4728c30c993e1ae9046cc7 Mon Sep 17 00:00:00 2001
From: Shubhanshu Mishra <smishra8@illinois.edu>
Date: Mon, 11 Apr 2022 16:26:36 -0500
Subject: [PATCH 1/6] Fixes #113 - Add Chebi (Chapti)

---
 biodatasets/chebi/chebi.py | 271 +++++++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)
 create mode 100644 biodatasets/chebi/chebi.py

diff --git a/biodatasets/chebi/chebi.py b/biodatasets/chebi/chebi.py
new file mode 100644
index 00000000..4f5c0166
--- /dev/null
+++ b/biodatasets/chebi/chebi.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo.
+
+When modifying it for your dataset, look for TODO items that offer specific instructions.
+
+Full documentation on writing dataset loading scripts can be found here:
+https://huggingface.co/docs/datasets/add_dataset.html
+
+To create a dataset loading script you will create a class and implement 3 methods:
+  * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
+  * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split.
+  * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
+
+TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset.
+
+[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
+"""
+
+import os
+from typing import List, Tuple, Dict
+
+import datasets
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+# TODO: Add BibTeX citation
+_CITATION = """\
+@article{,
+  author    = {},
+  title     = {},
+  journal   = {},
+  volume    = {},
+  year      = {},
+  url       = {},
+  doi       = {},
+  biburl    = {},
+  bibsource = {}
+}
+"""
+
+# TODO: create a module level variable with your dataset name (should match script name)
+#  E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer
+_DATASETNAME = "[dataset_name]"
+
+# TODO: Add description of the dataset here
+# You can copy an official description
+_DESCRIPTION = """\
+This dataset is designed for XXX NLP task.
+"""
+
+# TODO: Add a link to an official homepage for the dataset here (if possible)
+_HOMEPAGE = ""
+
+# TODO: Add the licence for the dataset here (if possible)
+# Note that this doesn't have to be a common open source license.
+# Some datasets have custom licenses. In this case, simply put the full license terms
+# into `_LICENSE`
+_LICENSE = ""
+
+# TODO: Add links to the urls needed to download your dataset files.
+#  For local datasets, this variable can be an empty dictionary.
+
+# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
+# In most cases the URLs will be the same for the source and bigbio config.
+# However, if you need to access different files for each config you can have multiple entries in this dict.
+# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
+_URLS = {
+    _DATASETNAME: "url or list of urls or ... ",
+}
+
+# TODO: add supported task by dataset. One dataset may support multiple tasks
+_SUPPORTED_TASKS = []  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+
+# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
+#  This version doesn't have to be consistent with semantic versioning. Anything that is
+#  provided by the original dataset as a version goes.
+_SOURCE_VERSION = ""
+
+_BIGBIO_VERSION = "1.0.0"
+
+
+# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
+#  Append "Dataset" to the class name: BioASQ --> BioasqDataset
+class NewDataset(datasets.GeneratorBasedBuilder):
+    """TODO: Short description of my dataset."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
+
+    # You will be able to load the "source" or "bigbio" configurations with
+    # ds_source = datasets.load_dataset('my_dataset', name='source')
+    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio')
+
+    # For local datasets you can make use of the `data_dir` and `data_files` kwargs
+    # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits
+    # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files")
+    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files")
+
+    # TODO: For each dataset, implement Config for Source and BigBio;
+    #  If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them.
+    #  Each of them should contain:
+    #   - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name]
+    #   - version: option = (SOURCE_VERSION|BIGBIO_VERSION)
+    #   - description: one line description for the dataset
+    #   - schema: options = (source|bigbio_[bigbio_schema_name])
+    #   - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b)
+    #  where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
+
+    BUILDER_CONFIGS = [
+        BigBioConfig(
+            name="[dataset_name]_source",
+            version=SOURCE_VERSION,
+            description="[dataset_name] source schema",
+            schema="source",
+            subset_id="[dataset_name]",
+        ),
+        BigBioConfig(
+            name="[dataset_name]_bigbio_[bigbio_schema_name]",
+            version=BIGBIO_VERSION,
+            description="[dataset_name] BigBio schema",
+            schema="bigbio_[bigbio_schema_name]",
+            subset_id="[dataset_name]",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "[dataset_name]_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible.
+
+        # You can arbitrarily nest lists and dictionaries.
+        # For iterables, use lists over tuples or `datasets.Sequence`
+
+        if self.config.schema == "source":
+            # TODO: Create your source schema here
+            raise NotImplementedError()
+
+            # EX: Arbitrary NER type dataset
+            # features = datasets.Features(
+            #    {
+            #        "doc_id": datasets.Value("string"),
+            #        "text": datasets.Value("string"),
+            #        "entities": [
+            #            {
+            #                "offsets": [datasets.Value("int64")],
+            #                "text": datasets.Value("string"),
+            #                "type": datasets.Value("string"),
+            #                "entity_id": datasets.Value("string"),
+            #            }
+            #        ],
+            #    }
+            # )
+
+        # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide.
+
+        # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format.
+
+        # For example bigbio_kb, bigbio_t2t
+        elif self.config.schema == "bigbio_[bigbio_schema_name]":
+            # e.g. features = schemas.kb_features
+            # TODO: Choose your big-bio schema here
+            raise NotImplementedError()
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
+
+        # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name
+
+        # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath
+
+        # PUBLIC DATASETS: Assign your data-dir based on the dl_manager.
+
+        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager
+
+        # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files.
+
+        # TODO: KEEP if your dataset is PUBLIC; remove if not
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        # TODO: KEEP if your dataset is LOCAL; remove if NOT
+        if self.config.data_dir is None:
+            raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
+        else:
+            data_dir = self.config.data_dir
+
+        # Not all datasets have predefined canonical train/val/test splits.
+        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "train.jsonl"),
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "test.jsonl"),
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "dev.jsonl"),
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+
+    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
+
+    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
+
+        # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
+
+        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files
+
+        if self.config.schema == "source":
+            # TODO: yield (key, example) tuples in the original dataset schema
+            for key, example in thing:
+                yield key, example
+
+        elif self.config.schema == "bigbio_[bigbio_schema_name]":
+            # TODO: yield (key, example) tuples in the bigbio schema
+            for key, example in thing:
+                yield key, example
+
+
+# This template is based on the following template from the datasets package:
+# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
+
+
+# This allows you to run your dataloader with `python [dataset_name].py` during development
+# TODO: Remove this before making your PR
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)

From 66a2609910d3144c5b6ea94420e4204e411308e8 Mon Sep 17 00:00:00 2001
From: Shubhanshu Mishra <shubhanshumishra@gmail.com>
Date: Tue, 12 Apr 2022 10:18:46 -0500
Subject: [PATCH 2/6] Updated info. Waiting on questions.

---
 biodatasets/chebi/chebi.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/biodatasets/chebi/chebi.py b/biodatasets/chebi/chebi.py
index 4f5c0166..234cda27 100644
--- a/biodatasets/chebi/chebi.py
+++ b/biodatasets/chebi/chebi.py
@@ -56,22 +56,24 @@
 
 # TODO: create a module level variable with your dataset name (should match script name)
 #  E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer
-_DATASETNAME = "[dataset_name]"
+_DATASETNAME = "chebi"
 
 # TODO: Add description of the dataset here
 # You can copy an official description
 _DESCRIPTION = """\
-This dataset is designed for XXX NLP task.
+This dataset contains biomedical named entities in patents. 
+It is sourced from the project Chemical Entities of Biological Interest (ChEBI), which is a freely available database of chemical compounds and other small molecular entities. The dataset is the Chapti dataset of ChEBI - http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/ 
+
 """
 
 # TODO: Add a link to an official homepage for the dataset here (if possible)
-_HOMEPAGE = ""
+_HOMEPAGE = "http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/"
 
 # TODO: Add the licence for the dataset here (if possible)
 # Note that this doesn't have to be a common open source license.
 # Some datasets have custom licenses. In this case, simply put the full license terms
 # into `_LICENSE`
-_LICENSE = ""
+_LICENSE = "Creative Commons"
 
 # TODO: Add links to the urls needed to download your dataset files.
 #  For local datasets, this variable can be an empty dictionary.
@@ -85,12 +87,15 @@
 }
 
 # TODO: add supported task by dataset. One dataset may support multiple tasks
-_SUPPORTED_TASKS = []  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+_SUPPORTED_TASKS = [
+    Tasks.NAMED_ENTITY_RECOGNITION,
+    Tasks.NAMED_ENTITY_DISAMBIGUATION,
+]  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
 
 # TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
 #  This version doesn't have to be consistent with semantic versioning. Anything that is
 #  provided by the original dataset as a version goes.
-_SOURCE_VERSION = ""
+_SOURCE_VERSION = "1.0.0"
 
 _BIGBIO_VERSION = "1.0.0"
 
@@ -206,7 +211,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
 
         # TODO: KEEP if your dataset is LOCAL; remove if NOT
         if self.config.data_dir is None:
-            raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
+            raise ValueError(
+                "This is a local dataset. Please pass the data_dir kwarg to load_dataset."
+            )
         else:
             data_dir = self.config.data_dir
 

From 6afa559945e79e8e64a70b2280e2963bc346296d Mon Sep 17 00:00:00 2001
From: Shubhanshu Mishra <shubhanshumishra@gmail.com>
Date: Thu, 28 Apr 2022 08:33:06 -0500
Subject: [PATCH 3/6] Added working code for ChEBI

---
 biodatasets/chebi/chebi.py | 333 +++++++++++++++++--------------------
 1 file changed, 155 insertions(+), 178 deletions(-)

diff --git a/biodatasets/chebi/chebi.py b/biodatasets/chebi/chebi.py
index 234cda27..0b47b29d 100644
--- a/biodatasets/chebi/chebi.py
+++ b/biodatasets/chebi/chebi.py
@@ -14,174 +14,128 @@
 # limitations under the License.
 
 """
-This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo.
-
-When modifying it for your dataset, look for TODO items that offer specific instructions.
-
-Full documentation on writing dataset loading scripts can be found here:
-https://huggingface.co/docs/datasets/add_dataset.html
-
-To create a dataset loading script you will create a class and implement 3 methods:
-  * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
-  * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split.
-  * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
-
-TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset.
-
-[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
+ChEBI Chapti contains the results of a collaboration between the European Patent Office and
+the ChEBI team. The goal of the project was to identify chemicals within patents and cross-
+reference them to ChEBI. The teams manually annotated chemicals in a set of 40 patents.
+This was used to measure the performance of the various text-mining tools. This set of
+40 patents is distributed in this directory. The results of this work can be seen on the ChEBI
+website.
+ 
+This work is distributed under the Creative Commons license: http://creativecommons.org/licenses/by/3.0/
 """
 
 import os
-from typing import List, Tuple, Dict
+from pathlib import Path
+from typing import Any, List, Tuple, Dict
 
 import datasets
+from lxml import etree
 from utils import schemas
 from utils.configs import BigBioConfig
 from utils.constants import Tasks
 
-# TODO: Add BibTeX citation
 _CITATION = """\
 @article{,
-  author    = {},
-  title     = {},
-  journal   = {},
-  volume    = {},
-  year      = {},
-  url       = {},
-  doi       = {},
-  biburl    = {},
-  bibsource = {}
+    title = {ChEBI: a database and ontology for chemical entities of biological interest},
+	author = {Degtyarenko, Kirill and de Matos, Paula and Ennis, Marcus and Hastings, Janna and Zbinden, Martin and McNaught, Alan and Alcántara, Rafael and Darsow, Michael and Guedj, Mickaël and Ashburner, Michael},
+	doi = {10.1093/nar/gkm791},
+	number = {Database issue},
+	volume = {36},
+	month = {January},
+	year = {2008},
+	journal = {Nucleic acids research},
+	issn = {0305-1048},
+	pages = {D344—50},
+	url = {https://europepmc.org/articles/PMC2238832},
+    biburl    = {https://aclanthology.org/W19-5008.bib},
+    bibsource = {https://aclanthology.org/W19-5008/}
 }
 """
 
-# TODO: create a module level variable with your dataset name (should match script name)
-#  E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer
 _DATASETNAME = "chebi"
 
-# TODO: Add description of the dataset here
-# You can copy an official description
 _DESCRIPTION = """\
-This dataset contains biomedical named entities in patents. 
-It is sourced from the project Chemical Entities of Biological Interest (ChEBI), which is a freely available database of chemical compounds and other small molecular entities. The dataset is the Chapti dataset of ChEBI - http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/ 
-
+ChEBI Chapti contains the results of a collaboration between the European Patent Office and
+the ChEBI team. The goal of the project was to identify chemicals within patents and cross-
+reference them to ChEBI. The teams manually annotated chemicals in a set of 40 patents.
+This was used to measure the performance of the various text-mining tools. This set of
+40 patents is distributed in this directory. The results of this work can be seen on the ChEBI
+website.
 """
 
-# TODO: Add a link to an official homepage for the dataset here (if possible)
 _HOMEPAGE = "http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/"
 
-# TODO: Add the licence for the dataset here (if possible)
-# Note that this doesn't have to be a common open source license.
-# Some datasets have custom licenses. In this case, simply put the full license terms
-# into `_LICENSE`
-_LICENSE = "Creative Commons"
-
-# TODO: Add links to the urls needed to download your dataset files.
-#  For local datasets, this variable can be an empty dictionary.
+_LICENSE = "Creative Commons License Attribution-ShareAlike 4.0 International"
 
-# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
-# In most cases the URLs will be the same for the source and bigbio config.
-# However, if you need to access different files for each config you can have multiple entries in this dict.
-# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
 _URLS = {
-    _DATASETNAME: "url or list of urls or ... ",
+    # The original dataset is hosted on CVS on SourceForge, so it was downloaded and re-uploaded in tar.gz format.
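+    # Converted via the following commands (NOTE(review): the MoNERo steps below look copied from another dataset script; confirm the actual repackaging commands):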
+    # cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/chebi co chapati/patentsGoldStandard/PatentAnnotations_GoldStandard.tgz
+    # mkdir -p ./MoNERo
+    # pushd ./MoNERo && 7z x ../MoNERo_2019.7z && popd
+    # tar -czf MoNERo.tar.gz ./MoNERo
+    _DATASETNAME: "https://github.com/bigscience-workshop/biomedical/files/8568960/PatentAnnotations_GoldStandard.tar.gz",
 }
 
-# TODO: add supported task by dataset. One dataset may support multiple tasks
-_SUPPORTED_TASKS = [
-    Tasks.NAMED_ENTITY_RECOGNITION,
-    Tasks.NAMED_ENTITY_DISAMBIGUATION,
-]  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION]
 
-# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
-#  This version doesn't have to be consistent with semantic versioning. Anything that is
-#  provided by the original dataset as a version goes.
 _SOURCE_VERSION = "1.0.0"
 
 _BIGBIO_VERSION = "1.0.0"
 
 
-# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
-#  Append "Dataset" to the class name: BioASQ --> BioasqDataset
-class NewDataset(datasets.GeneratorBasedBuilder):
-    """TODO: Short description of my dataset."""
+class ChebiDataset(datasets.GeneratorBasedBuilder):
+    """ChEBI Chapti: Patents dataset for NER and Entity Linking."""
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
 
-    # You will be able to load the "source" or "bigbio" configurations with
-    # ds_source = datasets.load_dataset('my_dataset', name='source')
-    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio')
-
-    # For local datasets you can make use of the `data_dir` and `data_files` kwargs
-    # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits
-    # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files")
-    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files")
-
-    # TODO: For each dataset, implement Config for Source and BigBio;
-    #  If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them.
-    #  Each of them should contain:
-    #   - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name]
-    #   - version: option = (SOURCE_VERSION|BIGBIO_VERSION)
-    #   - description: one line description for the dataset
-    #   - schema: options = (source|bigbio_[bigbio_schema_name])
-    #   - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b)
-    #  where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
-
     BUILDER_CONFIGS = [
         BigBioConfig(
-            name="[dataset_name]_source",
+            name=f"{_DATASETNAME}_source",
             version=SOURCE_VERSION,
-            description="[dataset_name] source schema",
+            description=f"{_DATASETNAME} source schema",
             schema="source",
-            subset_id="[dataset_name]",
+            subset_id=f"{_DATASETNAME}",
         ),
         BigBioConfig(
-            name="[dataset_name]_bigbio_[bigbio_schema_name]",
+            name=f"{_DATASETNAME}_bigbio_kb",
             version=BIGBIO_VERSION,
-            description="[dataset_name] BigBio schema",
-            schema="bigbio_[bigbio_schema_name]",
-            subset_id="[dataset_name]",
+            description=f"{_DATASETNAME} BigBio schema",
+            schema="bigbio_kb",
+            subset_id=f"{_DATASETNAME}",
         ),
     ]
 
-    DEFAULT_CONFIG_NAME = "[dataset_name]_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
-
-        # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible.
-
-        # You can arbitrarily nest lists and dictionaries.
-        # For iterables, use lists over tuples or `datasets.Sequence`
-
         if self.config.schema == "source":
-            # TODO: Create your source schema here
-            raise NotImplementedError()
-
-            # EX: Arbitrary NER type dataset
-            # features = datasets.Features(
-            #    {
-            #        "doc_id": datasets.Value("string"),
-            #        "text": datasets.Value("string"),
-            #        "entities": [
-            #            {
-            #                "offsets": [datasets.Value("int64")],
-            #                "text": datasets.Value("string"),
-            #                "type": datasets.Value("string"),
-            #                "entity_id": datasets.Value("string"),
-            #            }
-            #        ],
-            #    }
-            # )
-
-        # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide.
-
-        # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format.
+            features = datasets.Features(
+                {
+                    "doc_id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                    "entities": [
+                        {
+                            "phrase": datasets.Value("string"),
+                            "start": datasets.Value("int64"),
+                            "end": datasets.Value("int64"),
+                            "attrs": {
+                                "chebi-id": datasets.Value("string"),
+                                "comment": datasets.Value("string"),
+                                "epochem-id": datasets.Value("string"),
+                                "id": datasets.Value("string"),
+                                "name": datasets.Value("string"),
+                                "relevant": datasets.Value("string"),
+                                "type": datasets.Value("string"),
+                            },
+                        }
+                    ],
+                }
+            )
 
-        # For example bigbio_kb, bigbio_t2t
-        elif self.config.schema == "bigbio_[bigbio_schema_name]":
-            # e.g. features = schemas.kb_features
-            # TODO: Choose your big-bio schema here
-            raise NotImplementedError()
+        elif self.config.schema == "bigbio_kb":
+            features = schemas.kb_features
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -193,86 +147,109 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
-        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
-
-        # If you need to access the "source" or "bigbio" config choice, that will be in self.config.name
-
-        # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath
-
-        # PUBLIC DATASETS: Assign your data-dir based on the dl_manager.
-
-        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager
-
-        # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files.
-
-        # TODO: KEEP if your dataset is PUBLIC; remove if not
         urls = _URLS[_DATASETNAME]
         data_dir = dl_manager.download_and_extract(urls)
-
-        # TODO: KEEP if your dataset is LOCAL; remove if NOT
-        if self.config.data_dir is None:
-            raise ValueError(
-                "This is a local dataset. Please pass the data_dir kwarg to load_dataset."
-            )
-        else:
-            data_dir = self.config.data_dir
-
-        # Not all datasets have predefined canonical train/val/test splits.
-        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
+        data_dir = Path(data_dir) / "scrapbook"
+        file_paths = list(data_dir.glob("./*/source.xml"))
+        print(len(file_paths))
 
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 # Whatever you put in gen_kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "filepath": os.path.join(data_dir, "train.jsonl"),
+                    "file_paths": file_paths,
                     "split": "train",
                 },
             ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "test.jsonl"),
-                    "split": "test",
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "dev.jsonl"),
-                    "split": "dev",
-                },
-            ),
         ]
 
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-
-    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
-
-    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+    def _generate_examples(self, file_paths, split: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
-        # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
-
-        # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
-
-        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files
 
         if self.config.schema == "source":
-            # TODO: yield (key, example) tuples in the original dataset schema
-            for key, example in thing:
+            for filepath in file_paths:
+                key, example = self._read_example_from_file(filepath)
                 yield key, example
 
-        elif self.config.schema == "bigbio_[bigbio_schema_name]":
-            # TODO: yield (key, example) tuples in the bigbio schema
-            for key, example in thing:
+        elif self.config.schema == "bigbio_kb":
+            for filepath in file_paths:
+                key, example = self._read_example_from_file_in_kb_schema(filepath)
                 yield key, example
 
-
-# This template is based on the following template from the datasets package:
-# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
+    def _parse_paragraph(self, para, start=0):
+        """Return (text, entities) for `para`; entity offsets are shifted by `start`."""
+        para_text, entities = [], []
+        # NOTE(review): each tail is emitted before child text, so nested markup reorders text.
+        for e in para.iter():
+            text = e.text or ""  # elements without text (e.g. `<br/>`) report None
+            if e.tag == "ne":
+                entity = {
+                    "phrase": text,
+                    "start": start,
+                    "end": start + len(text),
+                    "attrs": dict(e.attrib),
+                }
+                entities.append(entity)
+            tail = e.tail or ""
+            para_text.extend((text, tail))
+            start += len(text) + len(tail)
+        return "".join(para_text), entities
+
+    def _read_example_from_file(self, filepath: Path) -> Tuple[str, Dict]:
+        with open(filepath, encoding="utf-8") as fp:
+            xml = etree.fromstring(fp.read().encode("utf-8"))
+        key = filepath.parent.name
+        document_text = []
+        entities = []
+        start = 0
+        for para in xml.iter("P", "p"):
+            # print(para.text)
+            para_text, para_entities = self._parse_paragraph(para, start=start)
+            document_text.append(para_text)
+            start += len(para_text)
+            entities.extend(para_entities)
+        document_text = "".join(document_text)
+        example = {"doc_id": key, "text": document_text, "entities": entities}
+        return key, example
+
+    def _parse_example_to_kb_schema(self, example) -> Dict[str, Any]:
+        text = example["text"]
+        doc_id = example["doc_id"]
+        passages = [
+            {
+                "id": f"{doc_id}-P0",
+                "type": "abstract",
+                "text": [text],
+                "offsets": [[0, len(text)]],
+            }
+        ]
+        entities = []
+        for i, e in enumerate(example["entities"]):
+            entity = {
+                "id": f"{doc_id}-E{i}",
+                "text": [e["phrase"]],
+                "offsets": [[e["start"], e["end"]]],
+                "type": e["attrs"]["type"],
+                "normalized": [{"db_name": "chebi", "db_id": e["attrs"]["chebi-id"]}],
+            }
+            entities.append(entity)
+        data = {
+            "id": doc_id,
+            "document_id": doc_id,
+            "passages": passages,
+            "entities": entities,
+            "relations": [],
+            "events": [],
+            "coreferences": [],
+        }
+        return data
+
+    def _read_example_from_file_in_kb_schema(self, filepath: Path) -> Tuple[str, Dict]:
+        key, example = self._read_example_from_file(filepath)
+        example = self._parse_example_to_kb_schema(example)
+        return key, example
 
 
-# This allows you to run your dataloader with `python [dataset_name].py` during development
-# TODO: Remove this before making your PR
 if __name__ == "__main__":
     datasets.load_dataset(__file__)

From b4cf675b23724e47521966668a8ebc1f551ddf04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?= <saengema@informatik.hu-berlin.de>
Date: Sat, 26 Oct 2024 09:31:48 +0200
Subject: [PATCH 4/6] refactor: Refactor implementation of chebi corpus to new
 hub-style

---
 bigbio/hub/hub_repos/chebi/README.md          |  43 ++
 bigbio/hub/hub_repos/chebi/bigbiohub.py       | 592 ++++++++++++++++++
 .../hub/hub_repos}/chebi/chebi.py             |  82 ++-
 3 files changed, 672 insertions(+), 45 deletions(-)
 create mode 100644 bigbio/hub/hub_repos/chebi/README.md
 create mode 100644 bigbio/hub/hub_repos/chebi/bigbiohub.py
 rename {biodatasets => bigbio/hub/hub_repos}/chebi/chebi.py (78%)

diff --git a/bigbio/hub/hub_repos/chebi/README.md b/bigbio/hub/hub_repos/chebi/README.md
new file mode 100644
index 00000000..f0e96121
--- /dev/null
+++ b/bigbio/hub/hub_repos/chebi/README.md
@@ -0,0 +1,43 @@
+
+---
+language: 
+- en
+bigbio_language: 
+- English
+license: unknown
+multilinguality: monolingual
+bigbio_license_shortname: UNKNOWN
+pretty_name: Chebi
+homepage: http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/
+bigbio_pubmed: False
+bigbio_public: True
+bigbio_tasks: 
+- NAMED_ENTITY_RECOGNITION
+- NAMED_ENTITY_DISAMBIGUATION
+---
+
+
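+# Dataset Card for Chebi
+
+## Dataset Description
+
+- **Homepage:** http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/
+- **Pubmed:** False
+- **Public:** True
+- **Tasks:** NER,NED
+
+ChEBI Chapti contains the results of a collaboration between the European Patent Office and the ChEBI team. The goal of the project was to identify chemicals within patents and cross-reference them to ChEBI. The teams manually annotated chemicals in a set of 40 patents. This was used to measure the performance of the various text-mining tools. This set of 40 patents is distributed in this directory. The results of this work can be seen on the ChEBI website.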
+
+## Citation Information
+
+```
+@article{degtyarenko2008chebi,
+  author    = {Degtyarenko, Kirill and de Matos, Paula and Ennis, Marcus and Hastings, Janna and Zbinden, Martin and McNaught, Alan and Alcántara, Rafael and Darsow, Michael and Guedj, Mickaël and Ashburner, Michael},
+  title     = {ChEBI: a database and ontology for chemical entities of biological interest},
+  journal   = {Nucleic acids research},
+  volume    = {36},
+  number    = {Database issue},
+  year      = {2008},
+  doi       = {10.1093/nar/gkm791},
+}
+```
diff --git a/bigbio/hub/hub_repos/chebi/bigbiohub.py b/bigbio/hub/hub_repos/chebi/bigbiohub.py
new file mode 100644
index 00000000..a4792b4b
--- /dev/null
+++ b/bigbio/hub/hub_repos/chebi/bigbiohub.py
@@ -0,0 +1,592 @@
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+import logging
+from pathlib import Path
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple
+
+import datasets
+
+if TYPE_CHECKING:
+    import bioc
+
+logger = logging.getLogger(__name__)
+
+
+BigBioValues = SimpleNamespace(NULL="<BB_NULL_STR>")
+
+
+@dataclass
+class BigBioConfig(datasets.BuilderConfig):
+    """BuilderConfig for BigBio."""
+
+    name: str = None
+    version: datasets.Version = None
+    description: str = None
+    schema: str = None
+    subset_id: str = None
+
+
+class Tasks(Enum):
+    NAMED_ENTITY_RECOGNITION = "NER"
+    NAMED_ENTITY_DISAMBIGUATION = "NED"
+    EVENT_EXTRACTION = "EE"
+    RELATION_EXTRACTION = "RE"
+    COREFERENCE_RESOLUTION = "COREF"
+    QUESTION_ANSWERING = "QA"
+    TEXTUAL_ENTAILMENT = "TE"
+    SEMANTIC_SIMILARITY = "STS"
+    TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS"
+    PARAPHRASING = "PARA"
+    TRANSLATION = "TRANSL"
+    SUMMARIZATION = "SUM"
+    TEXT_CLASSIFICATION = "TXTCLASS"
+
+
+entailment_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "premise": datasets.Value("string"),
+        "hypothesis": datasets.Value("string"),
+        "label": datasets.Value("string"),
+    }
+)
+
+pairs_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "text_1": datasets.Value("string"),
+        "text_2": datasets.Value("string"),
+        "label": datasets.Value("string"),
+    }
+)
+
+qa_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "question_id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "question": datasets.Value("string"),
+        "type": datasets.Value("string"),
+        "choices": [datasets.Value("string")],
+        "context": datasets.Value("string"),
+        "answer": datasets.Sequence(datasets.Value("string")),
+    }
+)
+
+text_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "text": datasets.Value("string"),
+        "labels": [datasets.Value("string")],
+    }
+)
+
+text2text_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "text_1": datasets.Value("string"),
+        "text_2": datasets.Value("string"),
+        "text_1_name": datasets.Value("string"),
+        "text_2_name": datasets.Value("string"),
+    }
+)
+
+kb_features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "document_id": datasets.Value("string"),
+        "passages": [
+            {
+                "id": datasets.Value("string"),
+                "type": datasets.Value("string"),
+                "text": datasets.Sequence(datasets.Value("string")),
+                "offsets": datasets.Sequence([datasets.Value("int32")]),
+            }
+        ],
+        "entities": [
+            {
+                "id": datasets.Value("string"),
+                "type": datasets.Value("string"),
+                "text": datasets.Sequence(datasets.Value("string")),
+                "offsets": datasets.Sequence([datasets.Value("int32")]),
+                "normalized": [
+                    {
+                        "db_name": datasets.Value("string"),
+                        "db_id": datasets.Value("string"),
+                    }
+                ],
+            }
+        ],
+        "events": [
+            {
+                "id": datasets.Value("string"),
+                "type": datasets.Value("string"),
+                # refers to the text_bound_annotation of the trigger
+                "trigger": {
+                    "text": datasets.Sequence(datasets.Value("string")),
+                    "offsets": datasets.Sequence([datasets.Value("int32")]),
+                },
+                "arguments": [
+                    {
+                        "role": datasets.Value("string"),
+                        "ref_id": datasets.Value("string"),
+                    }
+                ],
+            }
+        ],
+        "coreferences": [
+            {
+                "id": datasets.Value("string"),
+                "entity_ids": datasets.Sequence(datasets.Value("string")),
+            }
+        ],
+        "relations": [
+            {
+                "id": datasets.Value("string"),
+                "type": datasets.Value("string"),
+                "arg1_id": datasets.Value("string"),
+                "arg2_id": datasets.Value("string"),
+                "normalized": [
+                    {
+                        "db_name": datasets.Value("string"),
+                        "db_id": datasets.Value("string"),
+                    }
+                ],
+            }
+        ],
+    }
+)
+
+
+TASK_TO_SCHEMA = {
+    Tasks.NAMED_ENTITY_RECOGNITION.name: "KB",
+    Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB",
+    Tasks.EVENT_EXTRACTION.name: "KB",
+    Tasks.RELATION_EXTRACTION.name: "KB",
+    Tasks.COREFERENCE_RESOLUTION.name: "KB",
+    Tasks.QUESTION_ANSWERING.name: "QA",
+    Tasks.TEXTUAL_ENTAILMENT.name: "TE",
+    Tasks.SEMANTIC_SIMILARITY.name: "PAIRS",
+    Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS",
+    Tasks.PARAPHRASING.name: "T2T",
+    Tasks.TRANSLATION.name: "T2T",
+    Tasks.SUMMARIZATION.name: "T2T",
+    Tasks.TEXT_CLASSIFICATION.name: "TEXT",
+}
+
+SCHEMA_TO_TASKS = defaultdict(set)
+for task, schema in TASK_TO_SCHEMA.items():
+    SCHEMA_TO_TASKS[schema].add(task)
+SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS)
+
+VALID_TASKS = set(TASK_TO_SCHEMA.keys())
+VALID_SCHEMAS = set(TASK_TO_SCHEMA.values())
+
+SCHEMA_TO_FEATURES = {
+    "KB": kb_features,
+    "QA": qa_features,
+    "TE": entailment_features,
+    "T2T": text2text_features,
+    "TEXT": text_features,
+    "PAIRS": pairs_features,
+}
+
+
+def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple:
+
+    offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations]
+
+    text = ann.text
+
+    if len(offsets) > 1:
+        i = 0
+        texts = []
+        for start, end in offsets:
+            chunk_len = end - start
+            texts.append(text[i : chunk_len + i])
+            i += chunk_len
+            while i < len(text) and text[i] == " ":
+                i += 1
+    else:
+        texts = [text]
+
+    return offsets, texts
+
+
+def remove_prefix(a: str, prefix: str) -> str:
+    if a.startswith(prefix):
+        a = a[len(prefix) :]
+    return a
+
+
+def parse_brat_file(
+    txt_file: Path,
+    annotation_file_suffixes: List[str] = None,
+    parse_notes: bool = False,
+) -> Dict:
+    """
+    Parse a brat file into the schema defined below.
+    `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt'
+    Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files,
+    e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'.
+    Will include annotator notes, when `parse_notes == True`.
+    brat_features = datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "document_id": datasets.Value("string"),
+            "text": datasets.Value("string"),
+            "text_bound_annotations": [  # T line in brat, e.g. type or event trigger
+                {
+                    "offsets": datasets.Sequence([datasets.Value("int32")]),
+                    "text": datasets.Sequence(datasets.Value("string")),
+                    "type": datasets.Value("string"),
+                    "id": datasets.Value("string"),
+                }
+            ],
+            "events": [  # E line in brat
+                {
+                    "trigger": datasets.Value(
+                        "string"
+                    ),  # refers to the text_bound_annotation of the trigger,
+                    "id": datasets.Value("string"),
+                    "type": datasets.Value("string"),
+                    "arguments": datasets.Sequence(
+                        {
+                            "role": datasets.Value("string"),
+                            "ref_id": datasets.Value("string"),
+                        }
+                    ),
+                }
+            ],
+            "relations": [  # R line in brat
+                {
+                    "id": datasets.Value("string"),
+                    "head": {
+                        "ref_id": datasets.Value("string"),
+                        "role": datasets.Value("string"),
+                    },
+                    "tail": {
+                        "ref_id": datasets.Value("string"),
+                        "role": datasets.Value("string"),
+                    },
+                    "type": datasets.Value("string"),
+                }
+            ],
+            "equivalences": [  # Equiv line in brat
+                {
+                    "id": datasets.Value("string"),
+                    "ref_ids": datasets.Sequence(datasets.Value("string")),
+                }
+            ],
+            "attributes": [  # M or A lines in brat
+                {
+                    "id": datasets.Value("string"),
+                    "type": datasets.Value("string"),
+                    "ref_id": datasets.Value("string"),
+                    "value": datasets.Value("string"),
+                }
+            ],
+            "normalizations": [  # N lines in brat
+                {
+                    "id": datasets.Value("string"),
+                    "type": datasets.Value("string"),
+                    "ref_id": datasets.Value("string"),
+                    "resource_name": datasets.Value(
+                        "string"
+                    ),  # Name of the resource, e.g. "Wikipedia"
+                    "cuid": datasets.Value(
+                        "string"
+                    ),  # ID in the resource, e.g. 534366
+                    "text": datasets.Value(
+                        "string"
+                    ),  # Human readable description/name of the entity, e.g. "Barack Obama"
+                }
+            ],
+            ### OPTIONAL: Only included when `parse_notes == True`
+            "notes": [  # # lines in brat
+                {
+                    "id": datasets.Value("string"),
+                    "type": datasets.Value("string"),
+                    "ref_id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                }
+            ],
+        },
+        )
+    """
+
+    example = {}
+    example["document_id"] = txt_file.with_suffix("").name
+    with txt_file.open() as f:
+        example["text"] = f.read()
+
+    # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes
+    # for event extraction
+    if annotation_file_suffixes is None:
+        annotation_file_suffixes = [".a1", ".a2", ".ann"]
+
+    if len(annotation_file_suffixes) == 0:
+        raise AssertionError(
+            "At least one suffix for the to-be-read annotation files should be given!"
+        )
+
+    ann_lines = []
+    for suffix in annotation_file_suffixes:
+        annotation_file = txt_file.with_suffix(suffix)
+        try:
+            with annotation_file.open() as f:
+                ann_lines.extend(f.readlines())
+        except Exception:
+            continue
+
+    example["text_bound_annotations"] = []
+    example["events"] = []
+    example["relations"] = []
+    example["equivalences"] = []
+    example["attributes"] = []
+    example["normalizations"] = []
+
+    if parse_notes:
+        example["notes"] = []
+
+    for line in ann_lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        if line.startswith("T"):  # Text bound
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+            ann["type"] = fields[1].split()[0]
+            ann["offsets"] = []
+            span_str = remove_prefix(fields[1], (ann["type"] + " "))
+            text = fields[2]
+            for span in span_str.split(";"):
+                start, end = span.split()
+                ann["offsets"].append([int(start), int(end)])
+
+            # Heuristically split text of discontiguous entities into chunks
+            ann["text"] = []
+            if len(ann["offsets"]) > 1:
+                i = 0
+                for start, end in ann["offsets"]:
+                    chunk_len = end - start
+                    ann["text"].append(text[i : chunk_len + i])
+                    i += chunk_len
+                    while i < len(text) and text[i] == " ":
+                        i += 1
+            else:
+                ann["text"] = [text]
+
+            example["text_bound_annotations"].append(ann)
+
+        elif line.startswith("E"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+
+            ann["type"], ann["trigger"] = fields[1].split()[0].split(":")
+
+            ann["arguments"] = []
+            for role_ref_id in fields[1].split()[1:]:
+                argument = {
+                    "role": (role_ref_id.split(":"))[0],
+                    "ref_id": (role_ref_id.split(":"))[1],
+                }
+                ann["arguments"].append(argument)
+
+            example["events"].append(ann)
+
+        elif line.startswith("R"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+            ann["type"] = fields[1].split()[0]
+
+            ann["head"] = {
+                "role": fields[1].split()[1].split(":")[0],
+                "ref_id": fields[1].split()[1].split(":")[1],
+            }
+            ann["tail"] = {
+                "role": fields[1].split()[2].split(":")[0],
+                "ref_id": fields[1].split()[2].split(":")[1],
+            }
+
+            example["relations"].append(ann)
+
+        # '*' seems to be the legacy way to mark equivalences,
+        # but I couldn't find any info on the current way
+        # this might have to be adapted dependent on the brat version
+        # of the annotation
+        elif line.startswith("*"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+            ann["ref_ids"] = fields[1].split()[1:]
+
+            example["equivalences"].append(ann)
+
+        elif line.startswith("A") or line.startswith("M"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+
+            info = fields[1].split()
+            ann["type"] = info[0]
+            ann["ref_id"] = info[1]
+
+            if len(info) > 2:
+                ann["value"] = info[2]
+            else:
+                ann["value"] = ""
+
+            example["attributes"].append(ann)
+
+        elif line.startswith("N"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+            ann["text"] = fields[2]
+
+            info = fields[1].split()
+
+            ann["type"] = info[0]
+            ann["ref_id"] = info[1]
+            ann["resource_name"] = info[2].split(":")[0]
+            ann["cuid"] = info[2].split(":")[1]
+            example["normalizations"].append(ann)
+
+        elif parse_notes and line.startswith("#"):
+            ann = {}
+            fields = line.split("\t")
+
+            ann["id"] = fields[0]
+            ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL
+
+            info = fields[1].split()
+
+            ann["type"] = info[0]
+            ann["ref_id"] = info[1]
+            example["notes"].append(ann)
+
+    return example
+
+
+def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict:
+    """
+    Transform a brat parse (conforming to the standard brat schema) obtained with
+    `parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py)
+    :param brat_parse:
+    """
+
+    unified_example = {}
+
+    # Prefix all ids with document id to ensure global uniqueness,
+    # because brat ids are only unique within their document
+    id_prefix = brat_parse["document_id"] + "_"
+
+    # identical
+    unified_example["document_id"] = brat_parse["document_id"]
+    unified_example["passages"] = [
+        {
+            "id": id_prefix + "_text",
+            "type": "abstract",
+            "text": [brat_parse["text"]],
+            "offsets": [[0, len(brat_parse["text"])]],
+        }
+    ]
+
+    # get normalizations
+    ref_id_to_normalizations = defaultdict(list)
+    for normalization in brat_parse["normalizations"]:
+        ref_id_to_normalizations[normalization["ref_id"]].append(
+            {
+                "db_name": normalization["resource_name"],
+                "db_id": normalization["cuid"],
+            }
+        )
+
+    # separate entities and event triggers
+    unified_example["events"] = []
+    non_event_ann = brat_parse["text_bound_annotations"].copy()
+    for event in brat_parse["events"]:
+        event = event.copy()
+        event["id"] = id_prefix + event["id"]
+        trigger = next(
+            tr
+            for tr in brat_parse["text_bound_annotations"]
+            if tr["id"] == event["trigger"]
+        )
+        if trigger in non_event_ann:
+            non_event_ann.remove(trigger)
+        event["trigger"] = {
+            "text": trigger["text"].copy(),
+            "offsets": trigger["offsets"].copy(),
+        }
+        for argument in event["arguments"]:
+            argument["ref_id"] = id_prefix + argument["ref_id"]
+
+        unified_example["events"].append(event)
+
+    unified_example["entities"] = []
+    anno_ids = [ref_id["id"] for ref_id in non_event_ann]
+    for ann in non_event_ann:
+        entity_ann = ann.copy()
+        entity_ann["id"] = id_prefix + entity_ann["id"]
+        entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]]
+        unified_example["entities"].append(entity_ann)
+
+    # massage relations
+    unified_example["relations"] = []
+    skipped_relations = set()
+    for ann in brat_parse["relations"]:
+        if (
+            ann["head"]["ref_id"] not in anno_ids
+            or ann["tail"]["ref_id"] not in anno_ids
+        ):
+            skipped_relations.add(ann["id"])
+            continue
+        unified_example["relations"].append(
+            {
+                "arg1_id": id_prefix + ann["head"]["ref_id"],
+                "arg2_id": id_prefix + ann["tail"]["ref_id"],
+                "id": id_prefix + ann["id"],
+                "type": ann["type"],
+                "normalized": [],
+            }
+        )
+    if len(skipped_relations) > 0:
+        example_id = brat_parse["document_id"]
+        logger.info(
+            f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities."
+            f" Skip (for now): "
+            f"{list(skipped_relations)}"
+        )
+
+    # get coreferences
+    unified_example["coreferences"] = []
+    for i, ann in enumerate(brat_parse["equivalences"], start=1):
+        is_entity_cluster = True
+        for ref_id in ann["ref_ids"]:
+            if not ref_id.startswith("T"):  # not textbound -> no entity
+                is_entity_cluster = False
+            elif ref_id not in anno_ids:  # event trigger -> no entity
+                is_entity_cluster = False
+        if is_entity_cluster:
+            entity_ids = [id_prefix + i for i in ann["ref_ids"]]
+            unified_example["coreferences"].append(
+                {"id": id_prefix + str(i), "entity_ids": entity_ids}
+            )
+    return unified_example
diff --git a/biodatasets/chebi/chebi.py b/bigbio/hub/hub_repos/chebi/chebi.py
similarity index 78%
rename from biodatasets/chebi/chebi.py
rename to bigbio/hub/hub_repos/chebi/chebi.py
index 0b47b29d..2a2c0c1c 100644
--- a/biodatasets/chebi/chebi.py
+++ b/bigbio/hub/hub_repos/chebi/chebi.py
@@ -13,46 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""
-ChEBI Chapti contains the results of a collaboration between the European Patent Office and
-the ChEBI team. The goal of the project was to identify chemicals within patents and cross-
-reference them to ChEBI. The teams manually annotated chemicals in a set of 40 patents.
-This was used to measure the performance of the various text-mining tools. This set of
-40 patents is distributed in this directory. The results of this work can be seen on the ChEBI
-website.
- 
-This work is distributed under the Creative Commons license: http://creativecommons.org/licenses/by/3.0/
-"""
-
-import os
 from pathlib import Path
-from typing import Any, List, Tuple, Dict
+from typing import Any, Dict, List, Tuple
 
 import datasets
 from lxml import etree
-from utils import schemas
-from utils.configs import BigBioConfig
-from utils.constants import Tasks
+
+from .bigbiohub import BigBioConfig, Tasks, kb_features
+
+_LANGUAGES = ["English"]
+_PUBMED = False
+_LOCAL = False
 
 _CITATION = """\
 @article{,
-    title = {ChEBI: a database and ontology for chemical entities of biological interest},
-	author = {Degtyarenko, Kirill and de Matos, Paula and Ennis, Marcus and Hastings, Janna and Zbinden, Martin and McNaught, Alan and Alcántara, Rafael and Darsow, Michael and Guedj, Mickaël and Ashburner, Michael},
-	doi = {10.1093/nar/gkm791},
-	number = {Database issue},
-	volume = {36},
-	month = {January},
-	year = {2008},
-	journal = {Nucleic acids research},
-	issn = {0305-1048},
-	pages = {D344—50},
-	url = {https://europepmc.org/articles/PMC2238832},
-    biburl    = {https://aclanthology.org/W19-5008.bib},
-    bibsource = {https://aclanthology.org/W19-5008/}
+title = {ChEBI: a database and ontology for chemical entities of biological interest},
+author = {Degtyarenko, Kirill and de Matos, Paula and Ennis, Marcus and Hastings, Janna and Zbinden, Martin and \
+    McNaught, Alan and Alcántara, Rafael and Darsow, Michael and Guedj, Mickaël and Ashburner, Michael},
+doi = {10.1093/nar/gkm791},
+number = {Database issue},
+volume = {36},
+month = {January},
+year = {2008},
+journal = {Nucleic acids research},
+issn = {0305-1048},
+pages = {D344—50},
+url = {https://europepmc.org/articles/PMC2238832},
+biburl = {https://aclanthology.org/W19-5008.bib},
+bibsource = {https://aclanthology.org/W19-5008/}
 }
 """
 
 _DATASETNAME = "chebi"
+_DISPLAYNAME = "Chebi"
 
 _DESCRIPTION = """\
 ChEBI Chapti contains the results of a collaboration between the European Patent Office and
@@ -64,23 +57,23 @@
 """
 
 _HOMEPAGE = "http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/"
+_LICENSE = "CC_BY_SA_4p0"
 
-_LICENSE = "Creative Commons License Attribution-ShareAlike 4.0 International"
-
+DATA_URL = "https://github.com/bigscience-workshop/biomedical/files/8568960/PatentAnnotations_GoldStandard.tar.gz"
 _URLS = {
     # The original dataset is hosted on CVS on sourceforge. Hence I have downloaded and reuploded it as tar.gz format.
     # Converted via the following command:
-    # cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/chebi co chapati/patentsGoldStandard/PatentAnnotations_GoldStandard.tgz
+    # cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/chebi co \
+    #   chapati/patentsGoldStandard/PatentAnnotations_GoldStandard.tgz
     # mkdir -p ./MoNERo
     # pushd ./MoNERo && 7z x ../MoNERo_2019.7z && popd
     # tar -czf MoNERo.tar.gz ./MoNERo
-    _DATASETNAME: "https://github.com/bigscience-workshop/biomedical/files/8568960/PatentAnnotations_GoldStandard.tar.gz",
+    _DATASETNAME: DATA_URL,
 }
 
 _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION]
 
 _SOURCE_VERSION = "1.0.0"
-
 _BIGBIO_VERSION = "1.0.0"
 
 
@@ -133,9 +126,10 @@ def _info(self) -> datasets.DatasetInfo:
                     ],
                 }
             )
-
         elif self.config.schema == "bigbio_kb":
-            features = schemas.kb_features
+            features = kb_features
+        else:
+            raise NotImplementedError(f"Schema {self.config.schema} is not supported")
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -150,13 +144,11 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         urls = _URLS[_DATASETNAME]
         data_dir = dl_manager.download_and_extract(urls)
         data_dir = Path(data_dir) / "scrapbook"
-        file_paths = list(data_dir.glob("./*/source.xml"))
-        print(len(file_paths))
+        file_paths = list(sorted(data_dir.glob("./*/source.xml")))
 
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
-                # Whatever you put in gen_kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "file_paths": file_paths,
                     "split": "train",
@@ -203,14 +195,16 @@ def _read_example_from_file(self, filepath: Path) -> Tuple[str, Dict]:
         document_text = []
         entities = []
         start = 0
+
         for para in xml.iter("P", "p"):
-            # print(para.text)
             para_text, para_entities = self._parse_paragraph(para, start=start)
             document_text.append(para_text)
             start += len(para_text)
             entities.extend(para_entities)
+
         document_text = "".join(document_text)
         example = {"doc_id": key, "text": document_text, "entities": entities}
+
         return key, example
 
     def _parse_example_to_kb_schema(self, example) -> Dict[str, Any]:
@@ -231,7 +225,9 @@ def _parse_example_to_kb_schema(self, example) -> Dict[str, Any]:
                 "text": [e["phrase"]],
                 "offsets": [[e["start"], e["end"]]],
                 "type": e["attrs"]["type"],
-                "normalized": [{"db_name": "chebi", "db_id": e["attrs"]["chebi-id"]}],
+                "normalized": [
+                    {"db_name": "chebi", "db_id": chebi_id.strip()} for chebi_id in e["attrs"]["chebi-id"].split(",")
+                ],
             }
             entities.append(entity)
         data = {
@@ -249,7 +245,3 @@ def _read_example_from_file_in_kb_schema(self, filepath: Path) -> Tuple[str, Dic
         key, example = self._read_example_from_file(filepath)
         example = self._parse_example_to_kb_schema(example)
         return key, example
-
-
-if __name__ == "__main__":
-    datasets.load_dataset(__file__)

From 1793774bbabe65c00dec81da6e45e5b020b26159 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?= <saengema@informatik.hu-berlin.de>
Date: Sat, 26 Oct 2024 09:32:17 +0200
Subject: [PATCH 5/6] fix: Remove init files for consistency

---
 bigbio/hub/hub_repos/czi_drsm/__init__.py | 0
 bigbio/hub/hub_repos/flambe/__init__.py   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 bigbio/hub/hub_repos/czi_drsm/__init__.py
 delete mode 100644 bigbio/hub/hub_repos/flambe/__init__.py

diff --git a/bigbio/hub/hub_repos/czi_drsm/__init__.py b/bigbio/hub/hub_repos/czi_drsm/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/bigbio/hub/hub_repos/flambe/__init__.py b/bigbio/hub/hub_repos/flambe/__init__.py
deleted file mode 100644
index e69de29b..00000000

From 44c413e4884ce3fb18ba0448aba5d26c3e5cd846 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?= <saengema@informatik.hu-berlin.de>
Date: Sat, 26 Oct 2024 09:44:04 +0200
Subject: [PATCH 6/6] fix: Update README.md

---
 bigbio/hub/hub_repos/chebi/README.md | 54 +++++++++++++++++-----------
 bigbio/hub/hub_repos/chebi/chebi.py  |  2 +-
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/bigbio/hub/hub_repos/chebi/README.md b/bigbio/hub/hub_repos/chebi/README.md
index f0e96121..17f50aae 100644
--- a/bigbio/hub/hub_repos/chebi/README.md
+++ b/bigbio/hub/hub_repos/chebi/README.md
@@ -4,40 +4,52 @@ language:
 - en
 bigbio_language: 
 - English
-license: unknown
+license: CC_BY_SA_4p0
 multilinguality: monolingual
-bigbio_license_shortname: UNKNOWN
-pretty_name: PPR
-homepage: https://github.com/DMCB-GIST/PPRcorpus
-bigbio_pubmed: True
-bigbio_public: True
+bigbio_license_shortname: CC_BY_SA_4p0
+pretty_name: Chebi
+homepage: https://sourceforge.net/projects/chebi/
+bigbio_pubmed: False
+bigbio_public: True
 bigbio_tasks: 
 - NAMED_ENTITY_RECOGNITION
-- RELATION_EXTRACTION
+- NAMED_ENTITY_DISAMBIGUATION
 ---
 
 
-# Dataset Card for PPR
+# Dataset Card for Chebi
 
 ## Dataset Description
 
-- **Homepage:** https://github.com/DMCB-GIST/PPRcorpus
-- **Pubmed:** True
-- **Public:** True
-- **Tasks:** NER,RE
+- **Homepage:** https://sourceforge.net/projects/chebi/
+- **Pubmed:** False
+- **Public:** True
+- **Tasks:** NER,NED
 
-The Plant-Phenotype corpus is a text corpus with human annotations of plants, phenotypes, and their relations on a corpus in 600 PubMed abstracts.
+ChEBI Chapati contains the results of a collaboration between the European Patent Office and
+the ChEBI team. The goal of the project was to identify chemicals within patents and cross-
+reference them to ChEBI. The teams manually annotated chemicals in a set of 40 patents.
+This was used to measure the performance of the various text-mining tools. This set of
+40 patents is distributed in this directory. The results of this work can be seen on the ChEBI
+website.
 
 ## Citation Information
 
 ```
-@article{cho2022plant,
-  author    = {Cho, Hyejin and Kim, Baeksoo and Choi, Wonjun and Lee, Doheon and Lee, Hyunju},
-  title     = {Plant phenotype relationship corpus for biomedical relationships between plants and phenotypes},
-  journal   = {Scientific Data},
-  volume    = {9},
-  year      = {2022},
-  publisher = {Nature Publishing Group},
-  doi       = {https://doi.org/10.1038/s41597-022-01350-1},
+@article{,
+title = {ChEBI: a database and ontology for chemical entities of biological interest},
+author = {Degtyarenko, Kirill and de Matos, Paula and Ennis, Marcus and Hastings, Janna and Zbinden, Martin and
+    McNaught, Alan and Alcántara, Rafael and Darsow, Michael and Guedj, Mickaël and Ashburner, Michael},
+doi = {10.1093/nar/gkm791},
+number = {Database issue},
+volume = {36},
+month = {January},
+year = {2008},
+journal = {Nucleic acids research},
+issn = {0305-1048},
+pages = {D344--50},
+url = {https://europepmc.org/articles/PMC2238832},
+biburl = {https://aclanthology.org/W19-5008.bib},
+bibsource = {https://aclanthology.org/W19-5008/}
 }
 ```
diff --git a/bigbio/hub/hub_repos/chebi/chebi.py b/bigbio/hub/hub_repos/chebi/chebi.py
index 2a2c0c1c..d947156a 100644
--- a/bigbio/hub/hub_repos/chebi/chebi.py
+++ b/bigbio/hub/hub_repos/chebi/chebi.py
@@ -56,7 +56,7 @@
 website.
 """
 
-_HOMEPAGE = "http://chebi.cvs.sourceforge.net/viewvc/chebi/chapati/"
+_HOMEPAGE = "https://sourceforge.net/projects/chebi/"
 _LICENSE = "CC_BY_SA_4p0"
 
 DATA_URL = "https://github.com/bigscience-workshop/biomedical/files/8568960/PatentAnnotations_GoldStandard.tar.gz"