From d36db71436d15062b6712d83603bf94322134c99 Mon Sep 17 00:00:00 2001
From: cir7 <33249023+cir7@users.noreply.github.com>
Date: Wed, 6 Sep 2023 17:25:51 +0800
Subject: [PATCH] [Fix] Fix TIN normalize config (#2579)

---
 configs/_base_/models/tin_r50.py        | 4 +++-
 configs/recognition/tin/README.md       | 2 +-
 configs/recognition/tin/metafile.yml    | 4 ++--
 mmaction/models/backbones/resnet_tin.py | 6 +++---
 mmaction/models/backbones/resnet_tsm.py | 6 +++++-
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py
index 4a0dbbf6a0..cc27704f07 100644
--- a/configs/_base_/models/tin_r50.py
+++ b/configs/_base_/models/tin_r50.py
@@ -1,7 +1,9 @@
 # model settings
 preprocess_cfg = dict(
-    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], format_shape='NCHW')
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    format_shape='NCHW')
 
 model = dict(
     type='Recognizer2D',
diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md
index 1e3db9145b..a250388fc3 100644
--- a/configs/recognition/tin/README.md
+++ b/configs/recognition/tin/README.md
@@ -34,7 +34,7 @@ For a long time, the vision community tries to learn the spatio-temporal represe
 | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log |
 | :---------------------: | :------------: | :--: | :------: | :-------------: | :------: | :------: | :--------------: | :---------------------: | :--------: | :-----------------------: | :---------------------: | :---------------------: |
-| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.77 | 90.36 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) |
+| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.86 | 90.44 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) |
 
 Here, we use `finetune` to indicate that we use [TSM model](https://download.openmmlab.com/mmaction/v1.0/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) trained on Kinetics-400 to finetune the TIN model on Kinetics-400.
diff --git a/configs/recognition/tin/metafile.yml b/configs/recognition/tin/metafile.yml
index 6f69c73fda..7954bd90b3 100644
--- a/configs/recognition/tin/metafile.yml
+++ b/configs/recognition/tin/metafile.yml
@@ -66,8 +66,8 @@ Models:
     Results:
       - Dataset: Kinetics-400
         Metrics:
-          Top 1 Accuracy: 71.77
-          Top 5 Accuracy: 90.36
+          Top 1 Accuracy: 71.86
+          Top 5 Accuracy: 90.44
         Task: Action Recognition
     Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth
diff --git a/mmaction/models/backbones/resnet_tin.py b/mmaction/models/backbones/resnet_tin.py
index b8ff3659f0..0958546926 100644
--- a/mmaction/models/backbones/resnet_tin.py
+++ b/mmaction/models/backbones/resnet_tin.py
@@ -325,6 +325,9 @@ def init_structure(self):
         if len(self.non_local_cfg) != 0:
             self.make_non_local()
 
+    def _get_wrap_prefix(self):
+        return ['.net2']
+
     def make_temporal_interlace(self):
         """Make temporal interlace for some layers."""
         num_segment_list = [self.num_segments] * 4
@@ -365,6 +368,3 @@ def make_block_interlace(stage, num_segments, shift_div):
                                            self.shift_div)
         self.layer4 = make_block_interlace(self.layer4, num_segment_list[3],
                                            self.shift_div)
-
-    def init_weights(self):
-        pass
diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py
index 0079c96cb7..a2dcaf2939 100644
--- a/mmaction/models/backbones/resnet_tsm.py
+++ b/mmaction/models/backbones/resnet_tsm.py
@@ -305,6 +305,9 @@ def make_non_local(self):
                                                  self.num_segments,
                                                  self.non_local_cfg)
 
+    def _get_wrap_prefix(self):
+        return ['.net', '.block']
+
     def load_original_weights(self, logger):
         """Load weights from original checkpoint, which required converting
         keys."""
@@ -317,7 +320,7 @@ def load_original_weights(self, logger):
         for name, module in self.named_modules():
             # convert torchvision keys
             ori_name = name
-            for wrap_prefix in ['.net', '.block']:
+            for wrap_prefix in self._get_wrap_prefix():
                 if wrap_prefix in ori_name:
                     ori_name = ori_name.replace(wrap_prefix, '')
             wrapped_layers_map[ori_name] = name
@@ -352,6 +355,7 @@ def load_original_weights(self, logger):
                 if layer_name in wrapped_layers_map:
                     wrapped_name = param_name.replace(
                         layer_name, wrapped_layers_map[layer_name])
+                    print(f'wrapped_name {wrapped_name}')
                     state_dict_torchvision[
                         wrapped_name] = state_dict_torchvision.pop(param_name)
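
The config change above swaps the old symmetric `127.5` normalization for the standard ImageNet mean/std, which is what the TSM-pretrained backbone expects. The snippet below is an illustrative sketch only (plain NumPy, not the actual mmaction2 data preprocessor) showing how differently the two settings scale the same pixel values.

```python
import numpy as np

# One RGB pixel in the 0-255 range; values chosen close to the ImageNet mean
# purely for illustration.
img = np.array([[[124.0, 116.0, 104.0]]])

old = (img - 127.5) / 127.5  # previous preprocess_cfg
new = (img - np.array([123.675, 116.28, 103.53])) / np.array(
    [58.395, 57.12, 57.375])  # ImageNet statistics from the patched config

print(old)  # approximately [-0.027, -0.090, -0.184]
print(new)  # approximately [ 0.006, -0.005,  0.008]
```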
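The backbone change follows a template-method pattern: `ResNetTSM.load_original_weights` now asks `self._get_wrap_prefix()` which wrapper prefixes to strip when remapping checkpoint keys, and `ResNetTIN` overrides that hook to return `['.net2']` instead of TSM's `['.net', '.block']`. The sketch below is a minimal, self-contained illustration of that pattern; the class names and checkpoint keys are hypothetical, not the mmaction2 implementation.

```python
class TSMLikeBackbone:
    """Base class: remaps module names back to original checkpoint keys."""

    def _get_wrap_prefix(self):
        # TSM wraps shifted convs in `.net` and non-local blocks in `.block`.
        return ['.net', '.block']

    def remap_key(self, name: str) -> str:
        # Strip wrapper prefixes so the key matches the original checkpoint.
        for wrap_prefix in self._get_wrap_prefix():
            if wrap_prefix in name:
                name = name.replace(wrap_prefix, '')
        return name


class TINLikeBackbone(TSMLikeBackbone):
    """Subclass only overrides the hook, reusing the remapping logic."""

    def _get_wrap_prefix(self):
        # TIN wraps its temporal-interlace modules in `.net2` instead.
        return ['.net2']


if __name__ == '__main__':
    print(TSMLikeBackbone().remap_key('layer1.0.conv1.net.weight'))
    # -> layer1.0.conv1.weight
    print(TINLikeBackbone().remap_key('layer1.0.conv1.net2.weight'))
    # -> layer1.0.conv1.weight
```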