Skip to content

Commit

Permalink
add Builder for AI2D-Caption dataset.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 685722963
  • Loading branch information
pierrot0 authored and The TensorFlow Datasets Authors committed Oct 14, 2024
1 parent 1b317d2 commit bc48d05
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 0 deletions.
6 changes: 6 additions & 0 deletions tensorflow_datasets/datasets/ai2dcaption/CITATIONS.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@inproceedings{Zala2024DiagrammerGPT,
author = {Abhay Zala and Han Lin and Jaemin Cho and Mohit Bansal},
title = {DiagrammerGPT: Generating Open-Domain, Open-Platform Diagrams via LLM Planning},
year = {2024},
booktitle = {COLM},
}
5 changes: 5 additions & 0 deletions tensorflow_datasets/datasets/ai2dcaption/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This dataset is primarily based off the AI2D Dataset (see [here](
https://prior.allenai.org/projects/diagram-understanding)).

See [Section 4.1](https://arxiv.org/pdf/2310.12128) of our paper for
the AI2D-Caption dataset annotation process.
3 changes: 3 additions & 0 deletions tensorflow_datasets/datasets/ai2dcaption/TAGS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
content.data-type.image # Contains image data.
content.language.en # Contains text in language English / en.
content.subject.biology # Relates to biology.
15 changes: 15 additions & 0 deletions tensorflow_datasets/datasets/ai2dcaption/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""AI2DCaption dataset."""

import json
import os.path

import tensorflow_datasets.public_api as tfds

LAYOUT_NAMES = [
'abstract',
'circular',
'columns',
'linear',
'rows',
'tree',
'unspecified',
]

TOPIC_NAMES = ['astronomy', 'biology', 'engineering', 'unspecified']

TYPE_NAMES = [
'arrow',
'image',
'object',
'relationship',
'text',
]

CATEGORIES = [
'imageCaption',
'imageTitle',
'interObjectLinkage',
'intraObjectLabel',
'intraObjectLinkage',
'intraObjectRegionLabel',
'intraObjectTextLinkage',
'misc',
'sectionTitle',
'unspecified',
]

SPLITS = [
'auditor_llm_training_examples',
'gpt4v',
'llava_15',
'planner_llm_training_examples',
'test',
]

JSON_URL_TMPL = 'https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_{split}.json?download=true'

IMAGES_URL = 'http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip'


class Builder(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for AI2DCaption dataset."""

VERSION = tfds.core.Version('1.0.0')
RELEASE_NOTES = {
'1.0.0': 'Initial release.',
}

def _info(self) -> tfds.core.DatasetInfo:
"""Returns the dataset metadata."""
return self.dataset_info_from_configs(
features=tfds.features.FeaturesDict({
'image': tfds.features.Image(
shape=(None, None, 3),
doc=tfds.features.Documentation(
desc='The image of the diagram.',
),
),
'image_filename': tfds.features.Text(
doc=tfds.features.Documentation(
desc='Image filename. e.g. "1337.png"',
),
),
'topic': tfds.features.ClassLabel(names=TOPIC_NAMES),
'layout': tfds.features.ClassLabel(names=LAYOUT_NAMES),
'caption': tfds.features.Text(),
'relationships': tfds.features.Sequence(tfds.features.Text()),
'entities': tfds.features.Sequence(
tfds.features.FeaturesDict({
'id': tfds.features.Text(),
'type': tfds.features.ClassLabel(names=TYPE_NAMES),
'label': tfds.features.Text(),
'bounds': tfds.features.BBoxFeature(),
# Not always specified:
'cat': tfds.features.ClassLabel(names=CATEGORIES),
'from': tfds.features.Text(),
'to': tfds.features.Text(),
})
),
}),
supervised_keys=None,
homepage='https://huggingface.co/datasets/abhayzala/AI2D-Caption',
)

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
"""Returns SplitGenerators."""
paths = {split: JSON_URL_TMPL.format(split=split) for split in SPLITS}
paths['images'] = IMAGES_URL
dl_paths = dl_manager.download(paths)

return {
split: self._generate_examples(
split, dl_paths[split], dl_paths['images']
)
for split in SPLITS
}

def _generate_examples(self, split, json_path, images_path):
"""Yields examples."""
# Build an images index from JSON:
json_data = json.loads(json_path.read_text(encoding='utf-8'))
metadata_by_filename = {} # Maps from image id/filename to image metadata.
for image_metadata in json_data:
metadata_by_filename[image_metadata['image']] = image_metadata
# Iterate over the images,ß yield the ones present in metadata_by_filename:
for image_path, file in tfds.download.iter_archive(
images_path, tfds.download.ExtractMethod.ZIP
):
if not image_path.startswith('ai2d/images/'):
continue
image_id = os.path.basename(image_path)
if (metadata := metadata_by_filename.get(image_id)) is None:
continue
# Convert bounding box format from REL_XYXY to TFDS format.
entities = list(metadata['entities'].values())
for entity in entities:
# auditor_llm_training_examples split has non-sense bounds (max<min).
if (
bounds := entity.get('bounds')
) and split != 'auditor_llm_training_examples':
xmin, ymin, xmax, ymax = [c / 100.0 for c in bounds]
else:
xmin, ymin, xmax, ymax = 0.0, 0.0, 0.0, 0.0
entity['bounds'] = tfds.features.BBox(ymin, xmin, ymax, xmax)
entity.setdefault('label', '')
entity.setdefault('cat', 'unspecified')
entity.setdefault('from', '')
entity.setdefault('to', '')
relationships = metadata.get('relationships', [])
# ai2d_caption_test.json has a few relationships expressed as a dict.
if isinstance(relationships, dict):
relationships = list(relationships.values())
yield image_id, {
'image_filename': image_id,
'image': file,
'topic': metadata.get('topic', 'unspecified'),
# layout may be an empty string, hence the following construct.
'layout': metadata.get('layout', None) or 'unspecified',
'caption': metadata.get('caption', ''),
'relationships': relationships,
'entities': entities,
}
6 changes: 6 additions & 0 deletions tensorflow_datasets/datasets/ai2dcaption/checksums.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
http://ai2-website.s3.amazonaws.com/data/ai2d-all.zip 990965374 1a6b77eebb8b7dbdf76a0ba6ca76c2f97ce8f81d8ee33b06593aa722e54c4786 ai2d-all.zip
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_auditor_llm_training_examples.json?download=true 77845 3af7ef3f3c9e48183b78c521541ac7097156dc80421a2ee21935c95d76c6221e ai2d_caption_auditor_llm_training_examples.json
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_gpt4v.json?download=true 29115582 10aac517f432f384dad6552a88e660ef5966ade75d1550581cc0ad3985d20b43 ai2d_caption_gpt4v.json
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_llava_15.json?download=true 29925810 962c923b80bc4f62621ce5b3d572d3bd52d2daef3d68af0999a5ef01d8170a9f ai2d_caption_llava_15.json
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_planner_llm_training_examples.json?download=true 73533 43f1dd0e449f7bdf5c426ddefc8cb9f663757b22c77a4c305cba8c49a2c2ea8f ai2d_caption_planner_llm_training_examples.json
https://huggingface.co/datasets/abhayzala/AI2D-Caption/resolve/main/ai2d_caption_test.json?download=true 192643 54e279ba96177d78c4e9c4e8311c17272b94dc9c5ce5a1a1c701ad84e3a2db48 ai2d_caption_test.json

0 comments on commit bc48d05

Please sign in to comment.