From 87a078008c1f334520ba33ed654cb62c0f6a36d7 Mon Sep 17 00:00:00 2001 From: mzhaoshuai Date: Mon, 2 May 2022 19:38:17 +0800 Subject: [PATCH] add more akg --- README.md | 24 +++++++-- scripts/activitynet.sh | 6 +-- scripts/msrvtt.sh | 104 ++++++++++++++++++------------------ scripts/msvd.sh | 118 ++++++++++++++++++++--------------------- scripts/run_docker.sh | 2 +- 5 files changed, 136 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 9b783c9..6a7af44 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,13 @@ [![License](https://img.shields.io/badge/license-CC--BY--NC%204.0-green)](https://creativecommons.org/licenses/by-nc/4.0/) +[![arXiv](https://img.shields.io/badge/cs.CV-%09arXiv%3A2205.00823-red)](https://arxiv.org/abs/2205.00823) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-activitynet)](https://paperswithcode.com/sota/video-retrieval-on-activitynet?p=centerclip-token-clustering-for-efficient) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-msvd)](https://paperswithcode.com/sota/video-retrieval-on-msvd?p=centerclip-token-clustering-for-efficient) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-lsmdc)](https://paperswithcode.com/sota/video-retrieval-on-lsmdc?p=centerclip-token-clustering-for-efficient) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/centerclip-token-clustering-for-efficient/video-retrieval-on-msr-vtt-1ka)](https://paperswithcode.com/sota/video-retrieval-on-msr-vtt-1ka?p=centerclip-token-clustering-for-efficient) # CenterCLIP @@ -32,7 +41,7 @@ and decent computation cost reduction on MSVD, MSRVTT, LSMDC, and ActivityNet th
This is the code for the paper - + CenterCLIP: Token Clustering for Efficient Text-Video Retrieval.
@@ -65,7 +74,7 @@ CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. We are open to pull requests. -## Results and Checkpoints +## Results ### MSVD Experiments on MSVD need at least 2 RTX 3090 GPUs. @@ -265,6 +274,8 @@ Results of checkpoints on LSMDC are the same as the paper's data. Checkpoints on MSR-VTT and MSVD come from middle stages of our work. They have comparable performance with the paper's results (CenterCLIP, ViT-B/32). +Third-party reproduction and checkpoints are warmly welcomed. + Each zip file contains 4 types of files * a checkpoint of the model, typically, named as `ckpt.best.pth.tar` @@ -301,7 +312,7 @@ Corresponding settings are ready in the bash scripts. author = {Shuai Zhao and Linchao Zhu and Xiaohan Wang and Yi Yang}, title = {CenterCLIP: Token Clustering for Efficient Text-Video Retrieval}, booktitle = {{SIGIR} '22: The 45th International {ACM} {SIGIR} Conference on Research - and Development in Information Retrieval, July 11–15, 2022, Madrid, Spain}, + and Development in Information Retrieval, July 11–15, 2022, Madrid, Spain}, year = {2022}, } ``` @@ -322,6 +333,13 @@ This project is under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for detai * [mlfoundations/open_clip](https://github.com/mlfoundations/open_clip) * [huggingface/transformers](https://github.com/huggingface/transformers) * [facebookresearch/pytorchvideo](https://github.com/facebookresearch/pytorchvideo) +* [DeMoriarty/fast_pytorch_kmeans](https://github.com/DeMoriarty/fast_pytorch_kmeans) +* [subhadarship/kmeans_pytorch](https://github.com/subhadarship/kmeans_pytorch) +* [PyAV-Org/PyAV](https://github.com/PyAV-Org/PyAV) +* [sallymmx/ActionCLIP](https://github.com/sallymmx/ActionCLIP) +* [VideoNetworks/TokShift-Transformer](https://github.com/VideoNetworks/TokShift-Transformer) +* [yjxiong/tsn-pytorch](https://github.com/yjxiong/tsn-pytorch) +* [mit-han-lab/temporal-shift-module](https://github.com/mit-han-lab/temporal-shift-module) * [mzhaoshuai/Divide-and-Co-training](https://github.com/mzhaoshuai/Divide-and-Co-training) * [ZJULearning/RMI](https://github.com/ZJULearning/RMI) \ No newline at end of file diff --git a/scripts/activitynet.sh b/scripts/activitynet.sh index 5afbf14..3d0ff12 100644 --- a/scripts/activitynet.sh +++ b/scripts/activitynet.sh @@ -102,7 +102,7 @@ do cluster_inter=1 cluster_algo='kmediods++' cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160' - target_frames_blocks='60 60 60 60 60 60 20 20 20 20 20 20' + target_frames_blocks='60 60 60 60 60 60 20 20 20 20 20 20' ;; 03 ) pretrained_clip_name=ViT-B/16 @@ -111,7 +111,7 @@ do cluster_inter=1 cluster_algo='kmediods++' cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160' - target_frames_blocks='60 60 60 60 60 60 15 15 15 15 15 15' + target_frames_blocks='60 60 60 60 60 60 15 15 15 15 15 15' ;; 05 ) pretrained_clip_name=ViT-B/16 @@ -120,7 +120,7 @@ do cluster_inter=1 cluster_algo='kmediods++' cluster_num_blocks='196 196 196 196 196 196 160 160 160 160 160 160' - target_frames_blocks='60 60 60 60 60 60 12 12 12 12 12 12' + target_frames_blocks='60 60 60 60 60 60 12 12 12 12 12 12' ;; 04 ) # 8 V100 32GB GPUs diff --git a/scripts/msrvtt.sh b/scripts/msrvtt.sh index a12d798..828d498 100644 --- a/scripts/msrvtt.sh +++ b/scripts/msrvtt.sh @@ -78,12 +78,12 @@ do 62 ) do_train=0 do_eval=1 - train_csv=${DATA_PATH}/MSRVTT_train.7k.csv + train_csv=${DATA_PATH}/MSRVTT_train.7k.csv lr=5e-3 optim=AdamW cluster_inter=1 cluster_algo='kmediods++' - # this is a experiment at the middle stage of this work; in paper, we use minkowski_norm_p=2.0. + # this is a experiment at the middle stage of this work; in paper, we use minkowski_norm_p=2.0. minkowski_norm_p=1.0 cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49' target_frames_blocks='12 12 12 12 12 12 6 6 6 6 6 6' @@ -94,7 +94,7 @@ do 63 ) do_train=0 do_eval=1 - train_csv=${DATA_PATH}/MSRVTT_train.7k.csv + train_csv=${DATA_PATH}/MSRVTT_train.7k.csv lr=5e-3 optim=AdamW cluster_inter=1 @@ -109,7 +109,7 @@ do 80 ) do_train=0 do_eval=1 - train_csv=${DATA_PATH}/MSRVTT_train.7k.csv + train_csv=${DATA_PATH}/MSRVTT_train.7k.csv lr=5e-3 optim=AdamW cluster_inter=1 @@ -121,61 +121,61 @@ do resume=${HOME}/models/eclip/eclip_${dataset}_${num}/ckpt.best.pth.tar # save_feature_path=${HOME}/output/lsmdc_id_09 ;; - * ) - ;; - esac + * ) + ;; + esac model_dir=${HOME}/models/eclip/eclip_${dataset}_${num} echo "The model dir is ${model_dir}" # CUDA_LAUNCH_BLOCKING=1 python ../main.py \ - --do_train ${do_train} \ - --do_eval ${do_eval} \ - --num_thread_reader ${num_workers} \ - --epochs ${epochs} \ - --batch_size ${batch_size} \ - --n_display ${n_display} \ - --lmdb_dataset ${lmdb_dataset} \ - --train_csv ${train_csv} \ - --val_csv ${val_csv} \ - --data_path ${data_path} \ - --features_path ${features_path} \ - --output_dir ${model_dir} \ - --optim ${optim} \ - --lr ${lr} \ - --coef_lr ${coef_lr} \ - --wd ${wd} \ - --max_words ${max_words} \ - --max_frames ${max_frames} \ - --batch_size_val ${batch_size_val} \ - --datatype ${dataset} \ - --expand_msrvtt_sentences \ - --feature_framerate ${fps} \ - --freeze_layer_num 0 \ - --slice_framepos 2 \ - --loose_type \ - --linear_patch 2d \ - --sim_header meanP \ - --pretrained_clip_name ${pretrained_clip_name} \ - --precision ${precision} \ - --init_method ${init_method} \ + --do_train ${do_train} \ + --do_eval ${do_eval} \ + --num_thread_reader ${num_workers} \ + --epochs ${epochs} \ + --batch_size ${batch_size} \ + --n_display ${n_display} \ + --lmdb_dataset ${lmdb_dataset} \ + --train_csv ${train_csv} \ + --val_csv ${val_csv} \ + --data_path ${data_path} \ + --features_path ${features_path} \ + --output_dir ${model_dir} \ + --optim ${optim} \ + --lr ${lr} \ + --coef_lr ${coef_lr} \ + --wd ${wd} \ + --max_words ${max_words} \ + --max_frames ${max_frames} \ + --batch_size_val ${batch_size_val} \ + --datatype ${dataset} \ + --expand_msrvtt_sentences \ + --feature_framerate ${fps} \ + --freeze_layer_num 0 \ + --slice_framepos 2 \ + --loose_type \ + --linear_patch 2d \ + --sim_header meanP \ + --pretrained_clip_name ${pretrained_clip_name} \ + --precision ${precision} \ + --init_method ${init_method} \ --pretrained_dir ${pretrained_dir} \ - --cluster_algo ${cluster_algo} \ - --cluster_threshold ${cluster_threshold} \ - --cluster_distance ${cluster_distance} \ - --minkowski_norm_p ${minkowski_norm_p} \ - --cluster_iter_limit ${cluster_iter_limit} \ - --cluster_inter ${cluster_inter} \ - --cluster_embedding ${cluster_embedding} \ - --cluster_frame_embedding ${cluster_frame_embedding} \ - --cluster_num_blocks ${cluster_num_blocks} \ - --target_frames_blocks ${target_frames_blocks} \ - --deep_cluster ${deep_cluster} \ - --freeze_clip ${freeze_clip} \ - --resume ${resume} \ - --load_from_pretrained ${load_from_pretrained} \ - --camoe_dsl ${camoe_dsl} + --cluster_algo ${cluster_algo} \ + --cluster_threshold ${cluster_threshold} \ + --cluster_distance ${cluster_distance} \ + --minkowski_norm_p ${minkowski_norm_p} \ + --cluster_iter_limit ${cluster_iter_limit} \ + --cluster_inter ${cluster_inter} \ + --cluster_embedding ${cluster_embedding} \ + --cluster_frame_embedding ${cluster_frame_embedding} \ + --cluster_num_blocks ${cluster_num_blocks} \ + --target_frames_blocks ${target_frames_blocks} \ + --deep_cluster ${deep_cluster} \ + --freeze_clip ${freeze_clip} \ + --resume ${resume} \ + --load_from_pretrained ${load_from_pretrained} \ + --camoe_dsl ${camoe_dsl} done diff --git a/scripts/msvd.sh b/scripts/msvd.sh index 2c96287..1ee8922 100644 --- a/scripts/msvd.sh +++ b/scripts/msvd.sh @@ -70,74 +70,74 @@ for num in 22 do case ${num} in 22 ) - do_train=0 - do_eval=1 + do_train=0 + do_eval=1 resume=${HOME}/models/eclip/eclip_${dataset}_${num}/ckpt.best.pth.tar - lr=5e-3 - optim=AdamW - cluster_inter=1 - cluster_algo='kmediods++' - minkowski_norm_p=2.0 - cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49' - target_frames_blocks='12 12 12 12 12 12 4 4 4 4 4 4' + lr=5e-3 + optim=AdamW + cluster_inter=1 + cluster_algo='kmediods++' + minkowski_norm_p=2.0 + cluster_num_blocks='49 49 49 49 49 49 49 49 49 49 49 49' + target_frames_blocks='12 12 12 12 12 12 4 4 4 4 4 4' ;; - * ) - ;; - esac + * ) + ;; + esac model_dir=${HOME}/models/eclip/eclip_${dataset}_${num} echo "The model dir is ${model_dir}" # CUDA_LAUNCH_BLOCKING=1 python ../main.py \ - --do_train ${do_train} \ - --do_eval ${do_eval} \ - --num_thread_reader ${num_workers} \ - --epochs ${epochs} \ - --batch_size ${batch_size} \ - --n_display ${n_display} \ - --lmdb_dataset ${lmdb_dataset} \ - --train_csv ${train_csv} \ - --val_csv ${DATA_PATH}/MSRVTT_JSFUSION_test.csv \ - --data_path ${data_path} \ - --features_path ${features_path} \ - --output_dir ${model_dir} \ - --optim ${optim} \ - --lr ${lr} \ - --coef_lr ${coef_lr} \ - --wd ${wd} \ - --max_words 32 \ - --max_frames ${max_frames} \ - --batch_size_val ${batch_size_val} \ - --datatype ${dataset} \ - --expand_msrvtt_sentences \ - --feature_framerate ${fps} \ - --freeze_layer_num 0 \ - --slice_framepos 2 \ - --loose_type \ - --linear_patch 2d \ - --sim_header meanP \ - --pretrained_clip_name ${pretrained_clip_name} \ - --precision ${precision} \ - --init_method ${init_method} \ + --do_train ${do_train} \ + --do_eval ${do_eval} \ + --num_thread_reader ${num_workers} \ + --epochs ${epochs} \ + --batch_size ${batch_size} \ + --n_display ${n_display} \ + --lmdb_dataset ${lmdb_dataset} \ + --train_csv ${train_csv} \ + --val_csv ${DATA_PATH}/MSRVTT_JSFUSION_test.csv \ + --data_path ${data_path} \ + --features_path ${features_path} \ + --output_dir ${model_dir} \ + --optim ${optim} \ + --lr ${lr} \ + --coef_lr ${coef_lr} \ + --wd ${wd} \ + --max_words 32 \ + --max_frames ${max_frames} \ + --batch_size_val ${batch_size_val} \ + --datatype ${dataset} \ + --expand_msrvtt_sentences \ + --feature_framerate ${fps} \ + --freeze_layer_num 0 \ + --slice_framepos 2 \ + --loose_type \ + --linear_patch 2d \ + --sim_header meanP \ + --pretrained_clip_name ${pretrained_clip_name} \ + --precision ${precision} \ + --init_method ${init_method} \ --pretrained_dir ${pretrained_dir} \ - --cluster_algo ${cluster_algo} \ - --cluster_threshold ${cluster_threshold} \ - --cluster_distance ${cluster_distance} \ - --minkowski_norm_p ${minkowski_norm_p} \ - --cluster_iter_limit ${cluster_iter_limit} \ - --cluster_inter ${cluster_inter} \ - --cluster_embedding ${cluster_embedding} \ - --cluster_frame_embedding ${cluster_frame_embedding} \ - --cluster_num_blocks ${cluster_num_blocks} \ - --target_frames_blocks ${target_frames_blocks} \ - --deep_cluster ${deep_cluster} \ - --spectral_sigma ${spectral_sigma} \ - --spectral_graph ${spectral_graph} \ - --spectral_knn_k ${spectral_knn_k} \ - --freeze_clip ${freeze_clip} \ - --resume ${resume} \ - --load_from_pretrained ${load_from_pretrained} + --cluster_algo ${cluster_algo} \ + --cluster_threshold ${cluster_threshold} \ + --cluster_distance ${cluster_distance} \ + --minkowski_norm_p ${minkowski_norm_p} \ + --cluster_iter_limit ${cluster_iter_limit} \ + --cluster_inter ${cluster_inter} \ + --cluster_embedding ${cluster_embedding} \ + --cluster_frame_embedding ${cluster_frame_embedding} \ + --cluster_num_blocks ${cluster_num_blocks} \ + --target_frames_blocks ${target_frames_blocks} \ + --deep_cluster ${deep_cluster} \ + --spectral_sigma ${spectral_sigma} \ + --spectral_graph ${spectral_graph} \ + --spectral_knn_k ${spectral_knn_k} \ + --freeze_clip ${freeze_clip} \ + --resume ${resume} \ + --load_from_pretrained ${load_from_pretrained} done diff --git a/scripts/run_docker.sh b/scripts/run_docker.sh index 5f63259..eb9f007 100644 --- a/scripts/run_docker.sh +++ b/scripts/run_docker.sh @@ -25,7 +25,7 @@ docker_final_image="$docker_image$docker_image_tag" --shm-size 64G \ --memory-reservation 120G \ -v /home/${USER}:/home/${USER} --user=${UID}:${GID} -w ${DIR_NOW}/.. \ - -v /data1:/data1 \ + -v /data1:/data1 \ -v /etc/group:/etc/group:ro -v /etc/passwd:/etc/passwd:ro \ -p $docker_image_port:$docker_image_port $docker_final_image bash