all the datasets prepared
caradryanl committed May 15, 2024
1 parent d76cbd0 commit 4016a42
Showing 7 changed files with 223 additions and 15 deletions.
53 changes: 53 additions & 0 deletions diffusers/utils/get_celeba_hq_2_5k_eval_img_caption.py
@@ -0,0 +1,53 @@
import argparse
import json, os

import pandas as pd
from PIL import Image
from io import BytesIO


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the Parquet file into a pandas DataFrame.
    df = pd.read_parquet(dataset)
    df.info()

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        # Each cell in column 0 holds the image as a dict with a 'bytes' field.
        image_bytes = df.iloc[idx, 0]['bytes']
        image = Image.open(BytesIO(image_bytes))
        image = image.convert('RGB')  # normalize the mode before saving
        image.save(target + 'images/' + f'{idx}.png', 'PNG')

        caption[idx] = {
            "path": target + 'images/' + f'{idx}.png',
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/celeba-hq-2-5k/train-00001-of-00006.parquet")
    parser.add_argument("--target", type=str, default="datasets/celeba-hq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
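
For reference, a minimal sketch of the caption.json this script should emit under the default arguments (entries illustrative; note the "path" field stores the full target path here, whereas the FFHQ script below stores only the per-row filename from the parquet):

{
    "0": {
        "path": "datasets/celeba-hq-2-5k-eval/images/0.png",
        "height": 256,
        "width": 256,
        "caption": []
    },
    "1": {
        "path": "datasets/celeba-hq-2-5k-eval/images/1.png",
        ...
    }
}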
48 changes: 48 additions & 0 deletions diffusers/utils/get_coco_2_5k_eval_img_caption.py
@@ -0,0 +1,48 @@
import os, shutil
import argparse
import json


def main(args):
    dataset = args.dataset
    target = args.target
    training = args.training
    num_images = args.num_images

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    # Filenames already used by the training split.
    training_list = os.listdir(training + 'images/')

    data = {}
    cnt, in_cnt = 0, 0
    for id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_list:
            # Skip images that already belong to the training split.
            in_cnt += 1
            continue

        data[id] = metadata
        source = dataset + 'images/' + img_path
        dest = target + 'images/' + img_path
        shutil.copy(source, dest)

        cnt += 1
        if cnt >= num_images:
            break
    print(f"copied: {cnt}, skipped (already in training split): {in_cnt}")
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/coco2017-val/")
    parser.add_argument("--training", type=str, default="datasets/coco2017-val-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/coco2017-val-2-5k-eval/")
    args = parser.parse_args()
    main(args)
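
The core of this script is the dedup loop: skip any image that already belongs to the training split, copy the rest until --num-images are collected. The same pattern in isolation, as a minimal sketch with hypothetical names; it assumes metadata["path"] holds a bare filename so the os.listdir membership test can match:

import os, shutil

def copy_non_training(caption, src_dir, dst_dir, training_dir, limit):
    training = set(os.listdir(training_dir))   # basenames of training images
    kept = {}
    for key, meta in caption.items():
        name = meta["path"]                    # assumed: bare filename, no directory
        if name in training:                   # already used for training -> skip
            continue
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        kept[key] = meta
        if len(kept) >= limit:
            break
    return kept

Using a set makes each membership test O(1); the script's list works too, at O(n) per lookup.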
4 changes: 4 additions & 0 deletions diffusers/utils/get_datasets_eval.sh
@@ -0,0 +1,4 @@
python utils/get_laion_aesthetic_2_5k_eval_img_caption.py
python utils/get_coco_2_5k_eval_img_caption.py
python utils/get_celeba_hq_2_5k_eval_img_caption.py
python utils/get_ffhq_2_5k_eval_img_caption.py
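
Judging from the default --dataset/--training arguments of the four scripts, this runner assumes the source datasets and training splits already exist under datasets/, roughly along these lines (inferred from the defaults, not stated in the commit):

datasets/
    laion-aesthetic-50k/        # source images/ + caption.json
    laion-aesthetic-2-5k/       # training split to exclude
    coco2017-val/
    coco2017-val-2-5k/
    celeba-hq-2-5k/train-00001-of-00006.parquet
    ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet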
53 changes: 53 additions & 0 deletions diffusers/utils/get_ffhq_2_5k_eval_img_caption.py
@@ -0,0 +1,53 @@
import argparse
import json, os

import pandas as pd
from PIL import Image
from io import BytesIO


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the Parquet file into a pandas DataFrame.
    df = pd.read_parquet(dataset)
    df.info()

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        # Each cell in column 0 holds the image as a dict with 'bytes' and 'path' fields.
        image_bytes = df.iloc[idx, 0]['bytes']
        image = Image.open(BytesIO(image_bytes))
        image = image.convert('RGB')  # normalize the mode before saving
        image.save(target + 'images/' + df.iloc[idx, 0]['path'], 'PNG')

        caption[idx] = {
            "path": df.iloc[idx, 0]['path'],
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet")
    parser.add_argument("--target", type=str, default="datasets/ffhq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
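
Both parquet scripts assume each cell of column 0 is a dict with a 'bytes' key (and, for FFHQ, a 'path' key), which matches how Hugging Face datasets serializes image columns to Parquet. A quick single-row sanity check before committing to a full export (a sketch under that assumption):

import pandas as pd
from io import BytesIO
from PIL import Image

df = pd.read_parquet("datasets/ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet")
row = df.iloc[0, 0]                            # expected: {'bytes': b'...', 'path': '...'}
print(type(row), row.get('path'))
print(Image.open(BytesIO(row['bytes'])).size)  # should decode without error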
20 changes: 9 additions & 11 deletions diffusers/utils/get_hakubooru.py
@@ -15,45 +15,43 @@
logger.info("Loading danbooru2023.db")
load_db("datasets/danbooru2023/metadata/danbooru2023.db")

-num_samples = 10

logger.info("Querying posts")
# Querying posts for:
#   all posts with id >= 5_000_000,
#   1/2 of the posts with 3_000_000 <= id < 5_000_000,
#   1/3 of the posts with id < 3_000_000.
# Use seed_everything(1) to make the result reproducible.
seed_everything(1)
-member_choosed_post = (
+choosed_post = (
    list(Post.select().where(Post.id >= 5_000_000))
    + choices(
        Post.select().where(Post.id < 5_000_000, Post.id >= 3_000_000), k=1_000_000
    )
    + choices(Post.select().where(Post.id < 3_000_000), k=1_000_000)
)
-nonmember_choosed_post = (
-    [item for item in list(Post.select().where(Post.id < 5_000_000)) if item not in member_choosed_post]
-)

logger.info(f"Build exporter for members")
exporter = Exporter(
    source=TarSource("datasets/danbooru2023/data"),
    saver=FileSaver("datasets/danbooru2023/images_member"),
    captioner=KohakuCaptioner(),
-    process_batch_size=100000,
+    process_batch_size=1000,
    process_threads=2,
)
-logger.info(f"Found {len(member_choosed_post)} posts")
+logger.info(f"Found {len(choosed_post)} posts")
logger.info(f"Exporting images for members")
-exporter.export_posts(member_choosed_post)
+exporter.export_posts(choosed_post)

+nonmember_choosed_post = (
+    list(set(list(Post.select().where(Post.id < 5_000_000))) - set(choosed_post))
+)

logger.info(f"Build exporter for non-members")
exporter = Exporter(
    source=TarSource("datasets/danbooru2023/data"),
    saver=FileSaver("datasets/danbooru2023/images_nonmember"),
    captioner=KohakuCaptioner(),
-    process_batch_size=100000,
+    process_batch_size=1000,
    process_threads=2,
)
logger.info(f"Found {len(nonmember_choosed_post)} posts")
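
The nonmember selection change is more than a rename: the old list comprehension performed a linear not-in scan over the member list for every candidate post (quadratic overall, painful at millions of posts), while the new version builds each set once and takes a single difference, which is roughly linear, assuming Post instances hash and compare by primary key. The change in miniature (a sketch with illustrative names):

# Old: one full scan of `member` per candidate -- O(n * m).
nonmember = [p for p in all_posts if p not in member]

# New: hash each collection once, then one set difference -- O(n + m).
nonmember = list(set(all_posts) - set(member))

Note the set difference does not preserve the original query order, which appears acceptable here since the posts are only exported.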
10 changes: 6 additions & 4 deletions diffusers/utils/get_hakubooru_2_5k_img_caption.py
@@ -12,8 +12,8 @@ def list_files_with_extension(directory, extension):

    # Get a list of all files with the given extension
    files = glob.glob(os.path.join(directory, '**', '*' + extension), recursive=True)
-
-    return files
+    filenames = [os.path.basename(file) for file in files]
+    return filenames


if __name__ == "__main__":
@@ -39,6 +39,7 @@ def list_files_with_extension(directory, extension):
        if cnt == num_samples:
            with open(target_datasets[0] + 'caption.json', 'w') as file:
                json.dump(caption, file, indent=4)
+            caption = {}

        id = member_file[:-5]
        txt_file = id + '.txt'
@@ -57,11 +58,11 @@ def list_files_with_extension(directory, extension):
    with open(target_datasets[1] + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)

-    # select hakubooru-member
+    # select hakubooru-nonmember
    nonmember_dataset = 'datasets/danbooru2023/images_nonmember/'
    target_datasets = ['datasets/hakubooru-2-5k-nonmember/', 'datasets/hakubooru-2-5k-eval-nonmember/']
    nonmember_files = list_files_with_extension(nonmember_dataset, '.webp')
-    print(f"number of members: {len(member_files)}")
+    print(f"number of nonmembers: {len(nonmember_files)}")
    cnt = 0
    caption = {}
    for nonmember_file in nonmember_files:
@@ -75,6 +76,7 @@ def list_files_with_extension(directory, extension):
        if cnt == num_samples:
            with open(target_datasets[0] + 'caption.json', 'w') as file:
                json.dump(caption, file, indent=4)
+            caption = {}

        id = nonmember_file[:-5]
        txt_file = id + '.txt'
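
The two added caption = {} lines fix a leak between splits: once the first num_samples entries have been flushed to the training split's caption.json, the buffer is cleared so the eval split's caption.json, written after the loop, contains only the remaining entries rather than both sets. The pattern in isolation (a sketch; save_json and the loop variables are hypothetical stand-ins):

caption = {}
for i, item in enumerate(items):
    if i == num_samples:
        save_json(train_dir + 'caption.json', caption)
        caption = {}          # reset so the eval file only gets later entries
    caption[i] = item
save_json(eval_dir + 'caption.json', caption)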
50 changes: 50 additions & 0 deletions diffusers/utils/get_laion_aesthetic_2_5k_eval_img_caption.py
@@ -0,0 +1,50 @@
import os, shutil
import argparse
import json


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images
    training = args.training

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    # Filenames already used by the training split.
    training_list = os.listdir(training + 'images/')

    data = {}
    cnt, in_cnt = 0, 0
    for id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_list:
            # Skip images that already belong to the training split.
            in_cnt += 1
            continue

        data[id] = metadata
        source = dataset + 'images/' + img_path
        dest = target + 'images/' + img_path
        shutil.copy(source, dest)

        cnt += 1
        if cnt >= num_images:
            break
    print(f"copied: {cnt}, skipped (already in training split): {in_cnt}")
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/laion-aesthetic-50k/")
    parser.add_argument("--training", type=str, default="datasets/laion-aesthetic-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/laion-aesthetic-2-5k-eval/")
    args = parser.parse_args()
    main(args)
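
Each eval script exposes its paths and split size through argparse, so the defaults can be overridden per run, e.g. (hypothetical split size and target path):

python utils/get_laion_aesthetic_2_5k_eval_img_caption.py \
    --num-images 1000 \
    --dataset datasets/laion-aesthetic-50k/ \
    --training datasets/laion-aesthetic-2-5k/ \
    --target datasets/laion-aesthetic-1k-eval/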
