From 87a078008c1f334520ba33ed654cb62c0f6a36d7 Mon Sep 17 00:00:00 2001
From: mzhaoshuai <zhaoshuaimcc@foxmail.com>
Date: Mon, 2 May 2022 19:38:17 +0800
Subject: [PATCH] add more acknowledgements

---
 README.md              |  24 +++++++--
 scripts/activitynet.sh |   6 +--
 scripts/msrvtt.sh      | 104 ++++++++++++++++++------------------
 scripts/msvd.sh        | 118 ++++++++++++++++++++---------------------
 scripts/run_docker.sh  |   2 +-
 5 files changed, 136 insertions(+), 118 deletions(-)
diff --git a/README.md b/README.md
index 9b783c9..6a7af44 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,13 @@
 [![License](https://img.shields.io/badge/license-CC--BY--NC%204.0-green)](https://creativecommons.org/licenses/by-nc/4.0/)
+[![arXiv](https://img.shields.io/badge/cs.CV-arXiv%3A2205.00823-red)](https://arxiv.org/abs/2205.00823)
+
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-activitynet)](https://paperswithcode.com/sota/video-retrieval-on-activitynet?p=centerclip-token-clustering-for-efficient)
+
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-msvd)](https://paperswithcode.com/sota/video-retrieval-on-msvd?p=centerclip-token-clustering-for-efficient)
+
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-lsmdc)](https://paperswithcode.com/sota/video-retrieval-on-lsmdc?p=centerclip-token-clustering-for-efficient)
+
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-msr-vtt-1ka)](https://paperswithcode.com/sota/video-retrieval-on-msr-vtt-1ka?p=centerclip-token-clustering-for-efficient)
 
 
 # CenterCLIP
@@ -32,7 +41,7 @@ and decent computation cost reduction on MSVD, MSRVTT, LSMDC, and ActivityNet th
 <div align="justify">
 
 This is the code for the paper 
-<a href="todo">
+<a href="https://arxiv.org/abs/2205.00823">
 CenterCLIP: Token Clustering for Efficient Text-Video Retrieval.
 </a>
 <br />
@@ -65,7 +74,7 @@ CenterCLIP: Token Clustering for Efficient Text-Video Retrieval.
 We are open to pull requests.
 
 
-## Results and Checkpoints
+## Results
 
 ### MSVD
 Experiments on MSVD need at least 2 RTX 3090 GPUs.
@@ -265,6 +274,8 @@ Results of checkpoints on LSMDC are the same as the paper's data.
 Checkpoints on MSR-VTT and MSVD come from middle stages of our work.
 They have comparable performance with the paper's results (CenterCLIP, ViT-B/32).
 
+Third-party reproductions and checkpoints are warmly welcome.
+
 Each zip file contains 4 types of files
 
 * a checkpoint of the model, typically, named as `ckpt.best.pth.tar`
@@ -301,7 +312,7 @@ Corresponding settings are ready in the bash scripts.
   author    = {Shuai Zhao and Linchao Zhu and Xiaohan Wang and Yi Yang},
   title     = {CenterCLIP: Token Clustering for Efficient Text-Video Retrieval},
   booktitle = {{SIGIR} '22: The 45th International {ACM} {SIGIR} Conference on Research
-               and Development in Information Retrieval, July 11–15, 2022, Madrid, Spain},
+			   and Development in Information Retrieval, July 11–15, 2022, Madrid, Spain},
   year      = {2022},
 }
 ```
@@ -322,6 +333,13 @@ This project is under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for detai
 * [mlfoundations/open_clip](https://github.com/mlfoundations/open_clip)
 * [huggingface/transformers](https://github.com/huggingface/transformers)
 * [facebookresearch/pytorchvideo](https://github.com/facebookresearch/pytorchvideo)
+* [DeMoriarty/fast_pytorch_kmeans](https://github.com/DeMoriarty/fast_pytorch_kmeans)
+* [subhadarship/kmeans_pytorch](https://github.com/subhadarship/kmeans_pytorch)
+* [PyAV-Org/PyAV](https://github.com/PyAV-Org/PyAV)
+* [sallymmx/ActionCLIP](https://github.com/sallymmx/ActionCLIP)
+* [VideoNetworks/TokShift-Transformer](https://github.com/VideoNetworks/TokShift-Transformer)
+* [yjxiong/tsn-pytorch](https://github.com/yjxiong/tsn-pytorch)
+* [mit-han-lab/temporal-shift-module](https://github.com/mit-han-lab/temporal-shift-module)
 * [mzhaoshuai/Divide-and-Co-training](https://github.com/mzhaoshuai/Divide-and-Co-training)
 * [ZJULearning/RMI](https://github.com/ZJULearning/RMI)
 <!--te-->
\ No newline at end of file
diff --git a/scripts/activitynet.sh b/scripts/activitynet.sh
index 5afbf14..3d0ff12 100644
--- a/scripts/activitynet.sh
+++ b/scripts/activitynet.sh
@@ -102,7 +102,7 @@ do
 			cluster_inter=1
 			cluster_algo='kmediods++'
 			cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160'
-            target_frames_blocks='60 60 60 60 60 60 20 20 20 20 20 20'
+			target_frames_blocks='60 60 60 60 60 60 20 20 20 20 20 20'
 			;;
 		03 )
 			pretrained_clip_name=ViT-B/16
@@ -111,7 +111,7 @@ do
 			cluster_inter=1
 			cluster_algo='kmediods++'
 			cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160'
-            target_frames_blocks='60 60 60 60 60 60 15 15 15 15 15 15'
+			target_frames_blocks='60 60 60 60 60 60 15 15 15 15 15 15'
 			;;
 		05 )
 			pretrained_clip_name=ViT-B/16
@@ -120,7 +120,7 @@ do
 			cluster_inter=1
 			cluster_algo='kmediods++'
 			cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160'
-            target_frames_blocks='60 60 60 60 60 60 12 12 12 12 12 12'
+			target_frames_blocks='60 60 60 60 60 60 12 12 12 12 12 12'
 			;;
 		04 )
 			# 8 V100 32GB GPUs
diff --git a/scripts/msrvtt.sh b/scripts/msrvtt.sh
index a12d798..828d498 100644
--- a/scripts/msrvtt.sh
+++ b/scripts/msrvtt.sh
@@ -78,12 +78,12 @@ do
 		62 )
 			do_train=0
 			do_eval=1
-            train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
+			train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
 			lr=5e-3
 			optim=AdamW
 			cluster_inter=1
 			cluster_algo='kmediods++'
-            # this is a experiment at the middle stage of this work; in paper, we use minkowski_norm_p=2.0. 
+			# this is an experiment from the middle stage of this work; in the paper, we use minkowski_norm_p=2.0.
 			minkowski_norm_p=1.0
 			cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49'
 			target_frames_blocks='12 12 12 12 12 12 6 6 6 6 6 6'
@@ -94,7 +94,7 @@ do
 		63 )
 			do_train=0
 			do_eval=1
-            train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
+			train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
 			lr=5e-3
 			optim=AdamW
 			cluster_inter=1
@@ -109,7 +109,7 @@ do
 		80 )
 			do_train=0
 			do_eval=1
-            train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
+			train_csv=${DATA_PATH}/MSRVTT_train.7k.csv
 			lr=5e-3
 			optim=AdamW
 			cluster_inter=1
@@ -121,61 +121,61 @@ do
 			resume=${HOME}/models/eclip/eclip_${dataset}_${num}/ckpt.best.pth.tar
 			# save_feature_path=${HOME}/output/lsmdc_id_09
 			;;
-        * )
-            ;;
-    esac
+		* )
+			;;
+	esac
 
 model_dir=${HOME}/models/eclip/eclip_${dataset}_${num}
 echo "The model dir is ${model_dir}"
 
 # CUDA_LAUNCH_BLOCKING=1
 python ../main.py \
-        --do_train ${do_train} \
-        --do_eval ${do_eval} \
-        --num_thread_reader ${num_workers} \
-        --epochs ${epochs} \
-        --batch_size ${batch_size} \
-        --n_display ${n_display} \
-        --lmdb_dataset ${lmdb_dataset} \
-        --train_csv ${train_csv} \
-        --val_csv ${val_csv} \
-        --data_path ${data_path} \
-        --features_path ${features_path} \
-        --output_dir ${model_dir} \
-        --optim ${optim} \
-        --lr ${lr} \
-        --coef_lr ${coef_lr} \
-        --wd ${wd} \
-        --max_words ${max_words} \
-        --max_frames ${max_frames} \
-        --batch_size_val ${batch_size_val} \
-        --datatype ${dataset} \
-        --expand_msrvtt_sentences  \
-        --feature_framerate ${fps} \
-        --freeze_layer_num 0  \
-        --slice_framepos 2 \
-        --loose_type \
-        --linear_patch 2d \
-        --sim_header meanP \
-        --pretrained_clip_name ${pretrained_clip_name} \
-        --precision ${precision} \
-        --init_method ${init_method} \
+		--do_train ${do_train} \
+		--do_eval ${do_eval} \
+		--num_thread_reader ${num_workers} \
+		--epochs ${epochs} \
+		--batch_size ${batch_size} \
+		--n_display ${n_display} \
+		--lmdb_dataset ${lmdb_dataset} \
+		--train_csv ${train_csv} \
+		--val_csv ${val_csv} \
+		--data_path ${data_path} \
+		--features_path ${features_path} \
+		--output_dir ${model_dir} \
+		--optim ${optim} \
+		--lr ${lr} \
+		--coef_lr ${coef_lr} \
+		--wd ${wd} \
+		--max_words ${max_words} \
+		--max_frames ${max_frames} \
+		--batch_size_val ${batch_size_val} \
+		--datatype ${dataset} \
+		--expand_msrvtt_sentences  \
+		--feature_framerate ${fps} \
+		--freeze_layer_num 0  \
+		--slice_framepos 2 \
+		--loose_type \
+		--linear_patch 2d \
+		--sim_header meanP \
+		--pretrained_clip_name ${pretrained_clip_name} \
+		--precision ${precision} \
+		--init_method ${init_method} \
 		--pretrained_dir ${pretrained_dir} \
-        --cluster_algo ${cluster_algo} \
-        --cluster_threshold ${cluster_threshold} \
-        --cluster_distance ${cluster_distance} \
-        --minkowski_norm_p ${minkowski_norm_p} \
-        --cluster_iter_limit ${cluster_iter_limit} \
-        --cluster_inter ${cluster_inter} \
-        --cluster_embedding ${cluster_embedding} \
-        --cluster_frame_embedding ${cluster_frame_embedding} \
-        --cluster_num_blocks ${cluster_num_blocks} \
-        --target_frames_blocks ${target_frames_blocks} \
-        --deep_cluster ${deep_cluster} \
-        --freeze_clip ${freeze_clip} \
-        --resume ${resume} \
-        --load_from_pretrained ${load_from_pretrained} \
-        --camoe_dsl ${camoe_dsl} 
+		--cluster_algo ${cluster_algo} \
+		--cluster_threshold ${cluster_threshold} \
+		--cluster_distance ${cluster_distance} \
+		--minkowski_norm_p ${minkowski_norm_p} \
+		--cluster_iter_limit ${cluster_iter_limit} \
+		--cluster_inter ${cluster_inter} \
+		--cluster_embedding ${cluster_embedding} \
+		--cluster_frame_embedding ${cluster_frame_embedding} \
+		--cluster_num_blocks ${cluster_num_blocks} \
+		--target_frames_blocks ${target_frames_blocks} \
+		--deep_cluster ${deep_cluster} \
+		--freeze_clip ${freeze_clip} \
+		--resume ${resume} \
+		--load_from_pretrained ${load_from_pretrained} \
+		--camoe_dsl ${camoe_dsl} 
 
 done
 
diff --git a/scripts/msvd.sh b/scripts/msvd.sh
index 2c96287..1ee8922 100644
--- a/scripts/msvd.sh
+++ b/scripts/msvd.sh
@@ -70,74 +70,74 @@ for num in 22
 do
 	case ${num} in
 		22 )
-            do_train=0
-            do_eval=1
+			do_train=0
+			do_eval=1
 			resume=${HOME}/models/eclip/eclip_${dataset}_${num}/ckpt.best.pth.tar
-            lr=5e-3
-            optim=AdamW
-            cluster_inter=1
-            cluster_algo='kmediods++'
-            minkowski_norm_p=2.0
-            cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49'
-            target_frames_blocks='12 12 12 12 12 12 4 4 4 4 4 4'
+			lr=5e-3
+			optim=AdamW
+			cluster_inter=1
+			cluster_algo='kmediods++'
+			minkowski_norm_p=2.0
+			cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49'
+			target_frames_blocks='12 12 12 12 12 12 4 4 4 4 4 4'
 			;;
-        * )
-            ;;
-    esac
+		* )
+			;;
+	esac
 
 model_dir=${HOME}/models/eclip/eclip_${dataset}_${num}
 echo "The model dir is ${model_dir}"
 
 # CUDA_LAUNCH_BLOCKING=1
 python ../main.py \
-        --do_train ${do_train} \
-        --do_eval ${do_eval} \
-        --num_thread_reader ${num_workers} \
-        --epochs ${epochs} \
-        --batch_size ${batch_size} \
-        --n_display ${n_display} \
-        --lmdb_dataset ${lmdb_dataset} \
-        --train_csv ${train_csv} \
-        --val_csv ${DATA_PATH}/MSRVTT_JSFUSION_test.csv \
-        --data_path ${data_path} \
-        --features_path ${features_path} \
-        --output_dir ${model_dir} \
-        --optim ${optim} \
-        --lr ${lr} \
-        --coef_lr ${coef_lr} \
-        --wd ${wd} \
-        --max_words 32 \
-        --max_frames ${max_frames} \
-        --batch_size_val ${batch_size_val} \
-        --datatype ${dataset} \
-        --expand_msrvtt_sentences  \
-        --feature_framerate ${fps} \
-        --freeze_layer_num 0  \
-        --slice_framepos 2 \
-        --loose_type \
-        --linear_patch 2d \
-        --sim_header meanP \
-        --pretrained_clip_name ${pretrained_clip_name} \
-        --precision ${precision} \
-        --init_method ${init_method} \
+		--do_train ${do_train} \
+		--do_eval ${do_eval} \
+		--num_thread_reader ${num_workers} \
+		--epochs ${epochs} \
+		--batch_size ${batch_size} \
+		--n_display ${n_display} \
+		--lmdb_dataset ${lmdb_dataset} \
+		--train_csv ${train_csv} \
+		--val_csv ${DATA_PATH}/MSRVTT_JSFUSION_test.csv \
+		--data_path ${data_path} \
+		--features_path ${features_path} \
+		--output_dir ${model_dir} \
+		--optim ${optim} \
+		--lr ${lr} \
+		--coef_lr ${coef_lr} \
+		--wd ${wd} \
+		--max_words 32 \
+		--max_frames ${max_frames} \
+		--batch_size_val ${batch_size_val} \
+		--datatype ${dataset} \
+		--expand_msrvtt_sentences  \
+		--feature_framerate ${fps} \
+		--freeze_layer_num 0  \
+		--slice_framepos 2 \
+		--loose_type \
+		--linear_patch 2d \
+		--sim_header meanP \
+		--pretrained_clip_name ${pretrained_clip_name} \
+		--precision ${precision} \
+		--init_method ${init_method} \
 		--pretrained_dir ${pretrained_dir} \
-        --cluster_algo ${cluster_algo} \
-        --cluster_threshold ${cluster_threshold} \
-        --cluster_distance ${cluster_distance} \
-        --minkowski_norm_p ${minkowski_norm_p} \
-        --cluster_iter_limit ${cluster_iter_limit} \
-        --cluster_inter ${cluster_inter} \
-        --cluster_embedding ${cluster_embedding} \
-        --cluster_frame_embedding ${cluster_frame_embedding} \
-        --cluster_num_blocks ${cluster_num_blocks} \
-        --target_frames_blocks ${target_frames_blocks} \
-        --deep_cluster ${deep_cluster} \
-        --spectral_sigma ${spectral_sigma} \
-        --spectral_graph ${spectral_graph} \
-        --spectral_knn_k ${spectral_knn_k} \
-        --freeze_clip ${freeze_clip} \
-        --resume ${resume} \
-        --load_from_pretrained ${load_from_pretrained}
+		--cluster_algo ${cluster_algo} \
+		--cluster_threshold ${cluster_threshold} \
+		--cluster_distance ${cluster_distance} \
+		--minkowski_norm_p ${minkowski_norm_p} \
+		--cluster_iter_limit ${cluster_iter_limit} \
+		--cluster_inter ${cluster_inter} \
+		--cluster_embedding ${cluster_embedding} \
+		--cluster_frame_embedding ${cluster_frame_embedding} \
+		--cluster_num_blocks ${cluster_num_blocks} \
+		--target_frames_blocks ${target_frames_blocks} \
+		--deep_cluster ${deep_cluster} \
+		--spectral_sigma ${spectral_sigma} \
+		--spectral_graph ${spectral_graph} \
+		--spectral_knn_k ${spectral_knn_k} \
+		--freeze_clip ${freeze_clip} \
+		--resume ${resume} \
+		--load_from_pretrained ${load_from_pretrained}
 
 done
 
diff --git a/scripts/run_docker.sh b/scripts/run_docker.sh
index 5f63259..eb9f007 100644
--- a/scripts/run_docker.sh
+++ b/scripts/run_docker.sh
@@ -25,7 +25,7 @@ docker_final_image="$docker_image$docker_image_tag"
 						--shm-size 64G \
 						--memory-reservation 120G \
 						-v /home/${USER}:/home/${USER} --user=${UID}:${GID} -w ${DIR_NOW}/.. \
-                        -v /data1:/data1 \
+						-v /data1:/data1 \
 						-v /etc/group:/etc/group:ro -v /etc/passwd:/etc/passwd:ro \
 						-p $docker_image_port:$docker_image_port $docker_final_image bash