Skip to content

Commit

Permalink
Change dataset download scripts to use Cloudflare buckets directly instead of going through Nextcloud (#712)
Browse files Browse the repository at this point in the history
  • Loading branch information
morphine00 authored Mar 12, 2024
1 parent e237206 commit 68f8f38
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ while [ "$1" != "" ]; do
done

# Make sure the download destination exists (-p: no error if it already does).
# NOTE(review): ${OUTPUT_DIR} is unquoted throughout this hunk, so a value
# containing whitespace or glob characters would be word-split/expanded.
mkdir -p ${OUTPUT_DIR}
# Pre-commit download path: fetch val2014_30k.tsv from the Nextcloud share on
# cloud.mlcommons.org (-O names the output file, -c resumes a partial download).
# This diff view has no +/- markers; per the commit title this wget line is the
# one being replaced by the rclone commands below.
wget -O ${OUTPUT_DIR}/val2014_30k.tsv -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k.tsv"

# Define an rclone remote named "mlc-training": S3-type backend, Cloudflare
# provider, with the access key pair and R2 endpoint passed inline.
# NOTE(review): credentials are hard-coded on the command line; presumably a
# public read-only key pair for this bucket -- confirm before reuse elsewhere.
rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com

# Copy val2014_30k.tsv from the mlc-training remote into ${OUTPUT_DIR};
# -P prints transfer progress.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k.tsv ${OUTPUT_DIR} -P

Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ while [ "$1" != "" ]; do
done

# Make sure the download destination exists (-p: no error if it already does).
# NOTE(review): ${OUTPUT_DIR} is unquoted throughout this hunk, so a value
# containing whitespace or glob characters would be word-split/expanded.
mkdir -p ${OUTPUT_DIR}
# Pre-commit download path: fetch val2014_30k_stats.npz from the Nextcloud share
# on cloud.mlcommons.org (-O names the output file, -c resumes a partial
# download). This diff view has no +/- markers; per the commit title this wget
# line is the one being replaced by the rclone commands below.
wget -O ${OUTPUT_DIR}/val2014_30k_stats.npz -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k_stats.npz"

# Define an rclone remote named "mlc-training": S3-type backend, Cloudflare
# provider, with the access key pair and R2 endpoint passed inline.
# NOTE(review): credentials are hard-coded on the command line; presumably a
# public read-only key pair for this bucket -- confirm before reuse elsewhere.
rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com

# Copy val2014_30k_stats.npz from the mlc-training remote into ${OUTPUT_DIR};
# -P prints transfer progress.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k_stats.npz ${OUTPUT_DIR} -P

Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ mkdir -p ${OUTPUT_DIR}
# Work from inside the output directory: the bare "sha512sums.txt" handed to
# sha512sum at the end is looked up relative to this directory.
# NOTE(review): there is no explicit failure check on this cd (whether set -e
# is active is outside this view). The commands below still pass ${OUTPUT_DIR}
# as the destination after the cd, so a relative OUTPUT_DIR would resolve
# against the new working directory -- confirm OUTPUT_DIR is always absolute.
cd ${OUTPUT_DIR}


# Pre-commit download path: brace expansion yields 00000 .. 00831, and each
# <n>.tar shard is fetched one at a time from the Nextcloud share (-c resumes a
# partial download). This diff view has no +/- markers; per the commit title
# the wget lines are the ones being replaced by the rclone commands.
for i in {00000..00831}; do wget -O ${OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/images-webdataset-filtered&files=${i}.tar"; done
# Define an rclone remote named "mlc-training": S3-type backend, Cloudflare
# provider, with the access key pair and R2 endpoint passed inline.
# NOTE(review): credentials are hard-coded on the command line; presumably a
# public read-only key pair for this bucket -- confirm before reuse elsewhere.
rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com

# Pre-commit download path: fetch the checksum list from the Nextcloud share.
wget -O ${OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/images-webdataset-filtered&files=sha512sums.txt"
# Copy from images-webdataset-filtered/ on the remote into ${OUTPUT_DIR},
# limited by the --include filter to names matching *.tar; -P prints progress.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/images-webdataset-filtered/ ${OUTPUT_DIR} --include="*.tar" -P

# Copy the checksum list (sha512sums.txt) from the same remote directory.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/images-webdataset-filtered/sha512sums.txt ${OUTPUT_DIR} -P

# Check the files named in sha512sums.txt against their recorded SHA-512
# digests; --quiet suppresses the per-file OK lines so only problems print.
sha512sum --quiet -c sha512sums.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ mkdir -p ${OUTPUT_DIR}
# Work from inside the output directory: the bare "sha512sums.txt" handed to
# sha512sum at the end is looked up relative to this directory.
# NOTE(review): there is no explicit failure check on this cd (whether set -e
# is active is outside this view). The commands below still pass ${OUTPUT_DIR}
# as the destination after the cd, so a relative OUTPUT_DIR would resolve
# against the new working directory -- confirm OUTPUT_DIR is always absolute.
cd ${OUTPUT_DIR}


# Pre-commit download path: brace expansion yields 00000 .. 00831, and each
# <n>.tar shard is fetched one at a time from the Nextcloud share (-c resumes a
# partial download). This diff view has no +/- markers; per the commit title
# the wget lines are the ones being replaced by the rclone commands.
for i in {00000..00831}; do wget -O ${OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=${i}.tar"; done
# Define an rclone remote named "mlc-training": S3-type backend, Cloudflare
# provider, with the access key pair and R2 endpoint passed inline.
# NOTE(review): credentials are hard-coded on the command line; presumably a
# public read-only key pair for this bucket -- confirm before reuse elsewhere.
rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com

# Pre-commit download path: fetch the checksum list from the Nextcloud share.
wget -O ${OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=sha512sums.txt"
# Copy from moments-webdataset-filtered/ on the remote into ${OUTPUT_DIR},
# limited by the --include filter to names matching *.tar; -P prints progress.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/ ${OUTPUT_DIR} --include="*.tar" -P

# Copy the checksum list (sha512sums.txt) from the same remote directory.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/sha512sums.txt ${OUTPUT_DIR} -P

# Check the files named in sha512sums.txt against their recorded SHA-512
# digests; --quiet suppresses the per-file OK lines so only problems print.
sha512sum --quiet -c sha512sums.txt

0 comments on commit 68f8f38

Please sign in to comment.