From d36db71436d15062b6712d83603bf94322134c99 Mon Sep 17 00:00:00 2001
From: cir7 <33249023+cir7@users.noreply.github.com>
Date: Wed, 6 Sep 2023 17:25:51 +0800
Subject: [PATCH] [Fix] Fix TIN normalize config (#2579)

---
 configs/_base_/models/tin_r50.py        | 4 +++-
 configs/recognition/tin/README.md       | 2 +-
 configs/recognition/tin/metafile.yml    | 4 ++--
 mmaction/models/backbones/resnet_tin.py | 6 +++---
 mmaction/models/backbones/resnet_tsm.py | 6 +++++-
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py
index 4a0dbbf6a0..cc27704f07 100644
--- a/configs/_base_/models/tin_r50.py
+++ b/configs/_base_/models/tin_r50.py
@@ -1,7 +1,9 @@
 # model settings
 preprocess_cfg = dict(
-    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], format_shape='NCHW')
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    format_shape='NCHW')
 
 model = dict(
     type='Recognizer2D',
diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md
index 1e3db9145b..a250388fc3 100644
--- a/configs/recognition/tin/README.md
+++ b/configs/recognition/tin/README.md
@@ -34,7 +34,7 @@ For a long time, the vision community tries to learn the spatio-temporal represe
 | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log |
 | :---------------------: | :------------: | :--: | :------: | :-------------: | :------: | :------: | :--------------: | :---------------------: | :--------: | :-----------------------: | :---------------------: | :---------------------: |
-| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.77 | 90.36 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) |
+| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.86 | 90.44 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) |
 
 Here, we use `finetune` to indicate that we use [TSM model](https://download.openmmlab.com/mmaction/v1.0/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) trained on Kinetics-400 to finetune the TIN model on Kinetics-400.
diff --git a/configs/recognition/tin/metafile.yml b/configs/recognition/tin/metafile.yml
index 6f69c73fda..7954bd90b3 100644
--- a/configs/recognition/tin/metafile.yml
+++ b/configs/recognition/tin/metafile.yml
@@ -66,8 +66,8 @@ Models:
     Results:
       - Dataset: Kinetics-400
         Metrics:
-          Top 1 Accuracy: 71.77
-          Top 5 Accuracy: 90.36
+          Top 1 Accuracy: 71.86
+          Top 5 Accuracy: 90.44
         Task: Action Recognition
     Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth
diff --git a/mmaction/models/backbones/resnet_tin.py b/mmaction/models/backbones/resnet_tin.py
index b8ff3659f0..0958546926 100644
--- a/mmaction/models/backbones/resnet_tin.py
+++ b/mmaction/models/backbones/resnet_tin.py
@@ -325,6 +325,9 @@ def init_structure(self):
         if len(self.non_local_cfg) != 0:
             self.make_non_local()
 
+    def _get_wrap_prefix(self):
+        return ['.net2']
+
     def make_temporal_interlace(self):
         """Make temporal interlace for some layers."""
         num_segment_list = [self.num_segments] * 4
@@ -365,6 +368,3 @@ def make_block_interlace(stage, num_segments, shift_div):
                                            self.shift_div)
         self.layer4 = make_block_interlace(self.layer4, num_segment_list[3],
                                            self.shift_div)
-
-    def init_weights(self):
-        pass
diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py
index 0079c96cb7..a2dcaf2939 100644
--- a/mmaction/models/backbones/resnet_tsm.py
+++ b/mmaction/models/backbones/resnet_tsm.py
@@ -305,6 +305,9 @@ def make_non_local(self):
                                                  self.num_segments,
                                                  self.non_local_cfg)
 
+    def _get_wrap_prefix(self):
+        return ['.net', '.block']
+
     def load_original_weights(self, logger):
         """Load weights from original checkpoint, which required converting
         keys."""
@@ -317,7 +320,7 @@ def load_original_weights(self, logger):
         for name, module in self.named_modules():
             # convert torchvision keys
             ori_name = name
-            for wrap_prefix in ['.net', '.block']:
+            for wrap_prefix in self._get_wrap_prefix():
                 if wrap_prefix in ori_name:
                     ori_name = ori_name.replace(wrap_prefix, '')
             wrapped_layers_map[ori_name] = name
@@ -352,6 +355,7 @@ def load_original_weights(self, logger):
                 if layer_name in wrapped_layers_map:
                     wrapped_name = param_name.replace(
                         layer_name, wrapped_layers_map[layer_name])
+                    print(f'wrapped_name {wrapped_name}')
                     state_dict_torchvision[
                         wrapped_name] = state_dict_torchvision.pop(param_name)
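
The config change above swaps the old symmetric `127.5` normalization for the standard ImageNet mean/std, which is what the TSM-pretrained backbone expects. The snippet below is an illustrative sketch only (plain NumPy, not the actual mmaction2 data preprocessor) showing how differently the two settings scale the same pixel values.

```python
import numpy as np

# One RGB pixel in the 0-255 range; values chosen close to the ImageNet mean
# purely for illustration.
img = np.array([[[124.0, 116.0, 104.0]]])

old = (img - 127.5) / 127.5  # previous preprocess_cfg
new = (img - np.array([123.675, 116.28, 103.53])) / np.array(
    [58.395, 57.12, 57.375])  # ImageNet statistics from the patched config

print(old)  # approximately [-0.027, -0.090, -0.184]
print(new)  # approximately [ 0.006, -0.005,  0.008]
```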
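The backbone change follows a template-method pattern: `ResNetTSM.load_original_weights` now asks `self._get_wrap_prefix()` which wrapper prefixes to strip when remapping checkpoint keys, and `ResNetTIN` overrides that hook to return `['.net2']` instead of TSM's `['.net', '.block']`. The sketch below is a minimal, self-contained illustration of that pattern; the class names and checkpoint keys are hypothetical, not the mmaction2 implementation.

```python
class TSMLikeBackbone:
    """Base class: remaps module names back to original checkpoint keys."""

    def _get_wrap_prefix(self):
        # TSM wraps shifted convs in `.net` and non-local blocks in `.block`.
        return ['.net', '.block']

    def remap_key(self, name: str) -> str:
        # Strip wrapper prefixes so the key matches the original checkpoint.
        for wrap_prefix in self._get_wrap_prefix():
            if wrap_prefix in name:
                name = name.replace(wrap_prefix, '')
        return name


class TINLikeBackbone(TSMLikeBackbone):
    """Subclass only overrides the hook, reusing the remapping logic."""

    def _get_wrap_prefix(self):
        # TIN wraps its temporal-interlace modules in `.net2` instead.
        return ['.net2']


if __name__ == '__main__':
    print(TSMLikeBackbone().remap_key('layer1.0.conv1.net.weight'))
    # -> layer1.0.conv1.weight
    print(TINLikeBackbone().remap_key('layer1.0.conv1.net2.weight'))
    # -> layer1.0.conv1.weight
```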