all the datasets prepared
caradryanl committed May 15, 2024
1 parent d76cbd0 commit 4016a42
Showing 7 changed files with 223 additions and 15 deletions.
53 changes: 53 additions & 0 deletions diffusers/utils/get_celeba_hq_2_5k_eval_img_caption.py
@@ -0,0 +1,53 @@
import argparse
import json, os

import pandas as pd
from PIL import Image
from io import BytesIO


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the Parquet file into a pandas DataFrame.
    df = pd.read_parquet(dataset)
    df.info()

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        # Each cell in column 0 holds the image as a dict with a 'bytes' field.
        image_bytes = df.iloc[idx, 0]['bytes']
        image = Image.open(BytesIO(image_bytes))
        image = image.convert('RGB')  # normalize the mode before saving
        image.save(target + 'images/' + f'{idx}.png', 'PNG')

        caption[idx] = {
            "path": target + 'images/' + f'{idx}.png',
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/celeba-hq-2-5k/train-00001-of-00006.parquet")
    parser.add_argument("--target", type=str, default="datasets/celeba-hq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
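
For reference, a minimal sketch of the caption.json this script should emit under the default arguments (entries illustrative; note the "path" field stores the full target path here, whereas the FFHQ script below stores only the per-row filename from the parquet):

{
    "0": {
        "path": "datasets/celeba-hq-2-5k-eval/images/0.png",
        "height": 256,
        "width": 256,
        "caption": []
    },
    "1": {
        "path": "datasets/celeba-hq-2-5k-eval/images/1.png",
        ...
    }
}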
48 changes: 48 additions & 0 deletions diffusers/utils/get_coco_2_5k_eval_img_caption.py
@@ -0,0 +1,48 @@
import os, shutil
import argparse
import json


def main(args):
    dataset = args.dataset
    target = args.target
    training = args.training
    num_images = args.num_images

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    # Filenames already used by the training split.
    training_list = os.listdir(training + 'images/')

    data = {}
    cnt, in_cnt = 0, 0
    for id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_list:
            # Skip images that already belong to the training split.
            in_cnt += 1
            continue

        data[id] = metadata
        source = dataset + 'images/' + img_path
        dest = target + 'images/' + img_path
        shutil.copy(source, dest)

        cnt += 1
        if cnt >= num_images:
            break
    print(f"copied: {cnt}, skipped (already in training split): {in_cnt}")
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/coco2017-val/")
    parser.add_argument("--training", type=str, default="datasets/coco2017-val-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/coco2017-val-2-5k-eval/")
    args = parser.parse_args()
    main(args)
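
The core of this script is the dedup loop: skip any image that already belongs to the training split, copy the rest until --num-images are collected. The same pattern in isolation, as a minimal sketch with hypothetical names; it assumes metadata["path"] holds a bare filename so the os.listdir membership test can match:

import os, shutil

def copy_non_training(caption, src_dir, dst_dir, training_dir, limit):
    training = set(os.listdir(training_dir))   # basenames of training images
    kept = {}
    for key, meta in caption.items():
        name = meta["path"]                    # assumed: bare filename, no directory
        if name in training:                   # already used for training -> skip
            continue
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        kept[key] = meta
        if len(kept) >= limit:
            break
    return kept

Using a set makes each membership test O(1); the script's list works too, at O(n) per lookup.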
4 changes: 4 additions & 0 deletions diffusers/utils/get_datasets_eval.sh
@@ -0,0 +1,4 @@
python utils/get_laion_aesthetic_2_5k_eval_img_caption.py
python utils/get_coco_2_5k_eval_img_caption.py
python utils/get_celeba_hq_2_5k_eval_img_caption.py
python utils/get_ffhq_2_5k_eval_img_caption.py
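
Judging from the default --dataset/--training arguments of the four scripts, this runner assumes the source datasets and training splits already exist under datasets/, roughly along these lines (inferred from the defaults, not stated in the commit):

datasets/
    laion-aesthetic-50k/        # source images/ + caption.json
    laion-aesthetic-2-5k/       # training split to exclude
    coco2017-val/
    coco2017-val-2-5k/
    celeba-hq-2-5k/train-00001-of-00006.parquet
    ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet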
53 changes: 53 additions & 0 deletions diffusers/utils/get_ffhq_2_5k_eval_img_caption.py
@@ -0,0 +1,53 @@
import argparse
import json, os

import pandas as pd
from PIL import Image
from io import BytesIO


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images

    # Load the Parquet file into a pandas DataFrame.
    df = pd.read_parquet(dataset)
    df.info()

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    caption = {}
    for idx in range(num_images):
        # Each cell in column 0 holds the image as a dict with 'bytes' and 'path' fields.
        image_bytes = df.iloc[idx, 0]['bytes']
        image = Image.open(BytesIO(image_bytes))
        image = image.convert('RGB')  # normalize the mode before saving
        image.save(target + 'images/' + df.iloc[idx, 0]['path'], 'PNG')

        caption[idx] = {
            "path": df.iloc[idx, 0]['path'],
            "height": 256,
            "width": 256,
            "caption": []
        }
    with open(target + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet")
    parser.add_argument("--target", type=str, default="datasets/ffhq-2-5k-eval/")
    args = parser.parse_args()
    main(args)
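
Both parquet scripts assume each cell of column 0 is a dict with a 'bytes' key (and, for FFHQ, a 'path' key), which matches how Hugging Face datasets serializes image columns to Parquet. A quick single-row sanity check before committing to a full export (a sketch under that assumption):

import pandas as pd
from io import BytesIO
from PIL import Image

df = pd.read_parquet("datasets/ffhq-2-5k/train-00001-of-00015-78537a18b94b8879.parquet")
row = df.iloc[0, 0]                            # expected: {'bytes': b'...', 'path': '...'}
print(type(row), row.get('path'))
print(Image.open(BytesIO(row['bytes'])).size)  # should decode without error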
20 changes: 9 additions & 11 deletions diffusers/utils/get_hakubooru.py
@@ -15,45 +15,43 @@
logger.info("Loading danbooru2023.db")
load_db("datasets/danbooru2023/metadata/danbooru2023.db")

-num_samples = 10

logger.info("Querying posts")
# Querying posts for:
#   all posts with id >= 5_000_000,
#   1/2 of the posts with 3_000_000 <= id < 5_000_000,
#   1/3 of the posts with id < 3_000_000.
# Use seed_everything(1) to make the result reproducible.
seed_everything(1)
-member_choosed_post = (
+choosed_post = (
    list(Post.select().where(Post.id >= 5_000_000))
    + choices(
        Post.select().where(Post.id < 5_000_000, Post.id >= 3_000_000), k=1_000_000
    )
    + choices(Post.select().where(Post.id < 3_000_000), k=1_000_000)
)
-nonmember_choosed_post = (
-    [item for item in list(Post.select().where(Post.id < 5_000_000)) if item not in member_choosed_post]
-)

logger.info(f"Build exporter for members")
exporter = Exporter(
    source=TarSource("datasets/danbooru2023/data"),
    saver=FileSaver("datasets/danbooru2023/images_member"),
    captioner=KohakuCaptioner(),
-    process_batch_size=100000,
+    process_batch_size=1000,
    process_threads=2,
)
-logger.info(f"Found {len(member_choosed_post)} posts")
+logger.info(f"Found {len(choosed_post)} posts")
logger.info(f"Exporting images for members")
-exporter.export_posts(member_choosed_post)
+exporter.export_posts(choosed_post)

+nonmember_choosed_post = (
+    list(set(list(Post.select().where(Post.id < 5_000_000))) - set(choosed_post))
+)

logger.info(f"Build exporter for non-members")
exporter = Exporter(
    source=TarSource("datasets/danbooru2023/data"),
    saver=FileSaver("datasets/danbooru2023/images_nonmember"),
    captioner=KohakuCaptioner(),
-    process_batch_size=100000,
+    process_batch_size=1000,
    process_threads=2,
)
logger.info(f"Found {len(nonmember_choosed_post)} posts")
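
The nonmember selection change is more than a rename: the old list comprehension performed a linear not-in scan over the member list for every candidate post (quadratic overall, painful at millions of posts), while the new version builds each set once and takes a single difference, which is roughly linear, assuming Post instances hash and compare by primary key. The change in miniature (a sketch with illustrative names):

# Old: one full scan of `member` per candidate -- O(n * m).
nonmember = [p for p in all_posts if p not in member]

# New: hash each collection once, then one set difference -- O(n + m).
nonmember = list(set(all_posts) - set(member))

Note the set difference does not preserve the original query order, which appears acceptable here since the posts are only exported.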
10 changes: 6 additions & 4 deletions diffusers/utils/get_hakubooru_2_5k_img_caption.py
@@ -12,8 +12,8 @@ def list_files_with_extension(directory, extension):

    # Get a list of all files with the given extension
    files = glob.glob(os.path.join(directory, '**', '*' + extension), recursive=True)
-
-    return files
+    filenames = [os.path.basename(file) for file in files]
+    return filenames


if __name__ == "__main__":
@@ -39,6 +39,7 @@ def list_files_with_extension(directory, extension):
        if cnt == num_samples:
            with open(target_datasets[0] + 'caption.json', 'w') as file:
                json.dump(caption, file, indent=4)
+            caption = {}

        id = member_file[:-5]
        txt_file = id + '.txt'
@@ -57,11 +58,11 @@ def list_files_with_extension(directory, extension):
    with open(target_datasets[1] + 'caption.json', 'w') as file:
        json.dump(caption, file, indent=4)

-    # select hakubooru-member
+    # select hakubooru-nonmember
    nonmember_dataset = 'datasets/danbooru2023/images_nonmember/'
    target_datasets = ['datasets/hakubooru-2-5k-nonmember/', 'datasets/hakubooru-2-5k-eval-nonmember/']
    nonmember_files = list_files_with_extension(nonmember_dataset, '.webp')
-    print(f"number of members: {len(member_files)}")
+    print(f"number of nonmembers: {len(nonmember_files)}")
    cnt = 0
    caption = {}
    for nonmember_file in nonmember_files:
@@ -75,6 +76,7 @@ def list_files_with_extension(directory, extension):
        if cnt == num_samples:
            with open(target_datasets[0] + 'caption.json', 'w') as file:
                json.dump(caption, file, indent=4)
+            caption = {}

        id = nonmember_file[:-5]
        txt_file = id + '.txt'
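
The two added caption = {} lines fix a leak between splits: once the first num_samples entries have been flushed to the training split's caption.json, the buffer is cleared so the eval split's caption.json, written after the loop, contains only the remaining entries rather than both sets. The pattern in isolation (a sketch; save_json and the loop variables are hypothetical stand-ins):

caption = {}
for i, item in enumerate(items):
    if i == num_samples:
        save_json(train_dir + 'caption.json', caption)
        caption = {}          # reset so the eval file only gets later entries
    caption[i] = item
save_json(eval_dir + 'caption.json', caption)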
50 changes: 50 additions & 0 deletions diffusers/utils/get_laion_aesthetic_2_5k_eval_img_caption.py
@@ -0,0 +1,50 @@
import os, shutil
import argparse
import json


def main(args):
    dataset = args.dataset
    target = args.target
    num_images = args.num_images
    training = args.training

    with open(dataset + 'caption.json', 'r') as json_file:
        caption = json.load(json_file)

    # Create the output directory tree if it does not exist yet.
    os.makedirs(target + 'images/', exist_ok=True)

    # Filenames already used by the training split.
    training_list = os.listdir(training + 'images/')

    data = {}
    cnt, in_cnt = 0, 0
    for id, metadata in caption.items():
        img_path = metadata["path"]
        if img_path in training_list:
            # Skip images that already belong to the training split.
            in_cnt += 1
            continue

        data[id] = metadata
        source = dataset + 'images/' + img_path
        dest = target + 'images/' + img_path
        shutil.copy(source, dest)

        cnt += 1
        if cnt >= num_images:
            break
    print(f"copied: {cnt}, skipped (already in training split): {in_cnt}")
    with open(target + 'caption.json', 'w') as file:
        json.dump(data, file, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-images", type=int, default=2500)
    parser.add_argument("--dataset", type=str, default="datasets/laion-aesthetic-50k/")
    parser.add_argument("--training", type=str, default="datasets/laion-aesthetic-2-5k/")
    parser.add_argument("--target", type=str, default="datasets/laion-aesthetic-2-5k-eval/")
    args = parser.parse_args()
    main(args)
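
Each eval script exposes its paths and split size through argparse, so the defaults can be overridden per run, e.g. (hypothetical split size and target path):

python utils/get_laion_aesthetic_2_5k_eval_img_caption.py \
    --num-images 1000 \
    --dataset datasets/laion-aesthetic-50k/ \
    --training datasets/laion-aesthetic-2-5k/ \
    --target datasets/laion-aesthetic-1k-eval/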
