import argparse
import json
import os
from io import BytesIO

import pandas as pd
from PIL import Image


def main(args):
    """Export the first ``args.num_images`` CelebA-HQ images from a parquet
    shard as PNG files, and write a ``caption.json`` index for evaluation.

    The parquet column 0 holds per-row dicts with the raw image payload
    under the ``'bytes'`` key. Captions are left empty (unconditional eval).
    """
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the parquet shard into a DataFrame.
    df = pd.read_parquet(dataset)
    print(df.info())

    # makedirs(exist_ok=True) also creates the parent `target` directory and
    # tolerates re-runs; bare os.mkdir would fail in both cases.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        image_bytes = df.iloc[idx, 0]['bytes']
        image = Image.open(BytesIO(image_bytes))
        # Normalize to RGB so the image can be saved regardless of source mode.
        image = image.convert('RGB')
        image.save(target + 'images/' + f'{idx}.png', 'PNG')

        # CelebA-HQ eval images are recorded as fixed 256x256 — TODO confirm
        # against the actual parquet contents.
        caption[idx] = {
            "path": target + 'images/' + f'{idx}.png',
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/celeba-hq-2-5k/train-00001-of-00006.parquet")
    parser.add_argument("--target", type=str, default="datasets/celeba-hq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
import argparse
import json
import os
import shutil


def main(args):
    """Copy up to ``args.num_images`` COCO validation images that were NOT
    part of the training subset into an eval directory, together with a
    matching ``caption.json``.

    Reads ``dataset/caption.json`` (mapping id -> metadata with a ``"path"``
    filename), skips any image whose filename appears in
    ``training/images/``, and copies the rest into ``target/images/``.
    """
    dataset = args.dataset
    target = args.target
    training = args.training
    num_images = args.num_images

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # makedirs(exist_ok=True) creates the parent `target` directory too and
    # tolerates re-runs; bare os.mkdir would fail in both cases.
    os.makedirs(target + 'images/', exist_ok=True)

    # Set membership is O(1); a list scan per image is O(n) and the training
    # split can hold thousands of files.
    training_images = set(os.listdir(training + 'images/'))

    data = {}
    cnt, in_cnt = 0, 0
    for img_id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_images:
            # Image was used for training; exclude it from the eval set.
            in_cnt += 1
            continue

        data[img_id] = metadata
        shutil.copy(dataset + 'images/' + img_path,
                    target + 'images/' + img_path)

        cnt += 1
        if cnt >= num_images:
            break
    print(cnt, in_cnt)
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/coco2017-val/")
    parser.add_argument("--training", type=str, default="datasets/coco2017-val-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/coco2017-val-2-5k-eval/")
    args = parser.parse_args()
    main(args)
import argparse
import json
import os
from io import BytesIO

import pandas as pd
from PIL import Image


def main(args):
    """Export the first ``args.num_images`` FFHQ images from a parquet shard
    as PNG files, and write a ``caption.json`` index for evaluation.

    The parquet column 0 holds per-row dicts with the raw image payload
    under ``'bytes'`` and the original filename under ``'path'``. Captions
    are left empty (unconditional eval).
    """
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the parquet shard into a DataFrame.
    df = pd.read_parquet(dataset)
    print(df.info())

    # makedirs(exist_ok=True) also creates the parent `target` directory and
    # tolerates re-runs; bare os.mkdir would fail in both cases.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        record = df.iloc[idx, 0]
        image = Image.open(BytesIO(record['bytes']))
        # Normalize to RGB so the image can be saved regardless of source mode.
        image = image.convert('RGB')
        # Keep the original filename recorded in the parquet row.
        image.save(target + 'images/' + record['path'], 'PNG')

        # FFHQ eval images are recorded as fixed 256x256 — TODO confirm
        # against the actual parquet contents.
        caption[idx] = {
            "path": record['path'],
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet")
    parser.add_argument("--target", type=str, default="datasets/ffhq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
Querying posts for: # All the post after 5_000_000 @@ -25,35 +22,36 @@ # 1/3 of the post before 3_000_000 # Use seed_everything(1) to make the result reproducible seed_everything(1) - member_choosed_post = ( + choosed_post = ( list(Post.select().where(Post.id >= 5_000_000)) + choices( Post.select().where(Post.id < 5_000_000, Post.id >= 3_000_000), k=1_000_000 ) + choices(Post.select().where(Post.id < 3_000_000), k=1_000_000) ) - nonmember_choosed_post = ( - [item for item in list(Post.select().where(Post.id < 5_000_000)) if item not in member_choosed_post] - ) logger.info(f"Build exporter for members") exporter = Exporter( source=TarSource("datasets/danbooru2023/data"), saver=FileSaver("datasets/danbooru2023/images_member"), captioner=KohakuCaptioner(), - process_batch_size=100000, + process_batch_size=1000, process_threads=2, ) - logger.info(f"Found {len(member_choosed_post)} posts") + logger.info(f"Found {len(choosed_post)} posts") logger.info(f"Exporting images for members") - exporter.export_posts(member_choosed_post) + exporter.export_posts(choosed_post) + + nonmember_choosed_post = ( + list(set(list(Post.select().where(Post.id < 5_000_000))) - set(choosed_post)) + ) logger.info(f"Build exporter for non-members") exporter = Exporter( source=TarSource("datasets/danbooru2023/data"), saver=FileSaver("datasets/danbooru2023/images_nonmember"), captioner=KohakuCaptioner(), - process_batch_size=100000, + process_batch_size=1000, process_threads=2, ) logger.info(f"Found {len(nonmember_choosed_post)} posts") diff --git a/diffusers/utils/get_hakubooru_2_5k_img_caption.py b/diffusers/utils/get_hakubooru_2_5k_img_caption.py index ef16a6f..0389c4f 100644 --- a/diffusers/utils/get_hakubooru_2_5k_img_caption.py +++ b/diffusers/utils/get_hakubooru_2_5k_img_caption.py @@ -12,8 +12,8 @@ def list_files_with_extension(directory, extension): # Get a list of all files with the given extension files = glob.glob(os.path.join(directory, '**', '*' + extension), recursive=True) - - 
import argparse
import json
import os
import shutil


def main(args):
    """Copy up to ``args.num_images`` LAION-Aesthetic images that were NOT
    part of the training subset into an eval directory, together with a
    matching ``caption.json``.

    Reads ``dataset/caption.json`` (mapping id -> metadata with a ``"path"``
    filename), skips any image whose filename appears in
    ``training/images/``, and copies the rest into ``target/images/``.
    """
    dataset = args.dataset
    target = args.target
    num_images = args.num_images
    training = args.training

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # makedirs(exist_ok=True) creates the parent `target` directory too and
    # tolerates re-runs; bare os.mkdir would fail in both cases.
    os.makedirs(target + 'images/', exist_ok=True)

    # Set membership is O(1); a list scan per image is O(n) and the training
    # split can hold thousands of files.
    training_images = set(os.listdir(training + 'images/'))

    data = {}
    cnt, in_cnt = 0, 0
    for img_id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_images:
            # Image was used for training; exclude it from the eval set.
            in_cnt += 1
            continue

        data[img_id] = metadata
        shutil.copy(dataset + 'images/' + img_path,
                    target + 'images/' + img_path)

        cnt += 1
        if cnt >= num_images:
            break
    print(cnt, in_cnt)
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/laion-aesthetic-50k/")
    parser.add_argument("--training", type=str, default="datasets/laion-aesthetic-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/laion-aesthetic-2-5k-eval/")
    args = parser.parse_args()
    main(args)