From 7996ee5b90735a6e620a741f3b2f8f2722bce7a6 Mon Sep 17 00:00:00 2001 From: oneScotch <71915686+oneScotch@users.noreply.github.com> Date: Wed, 5 Apr 2023 06:26:19 +0200 Subject: [PATCH] [Add] CLIFF (#302) * add cliff head * add function to convert from crop to full camera * add cliff annotation datasets converter * add tramsforms to get bbox information * store crop trans * cliff mesh estimator * modification to take in different resolutions * add configs * add missing comma * format correction * isort formating * correct error in cliff_head * revert unnecessary changes in cliff_head * add configs(single dataset) and small modification * configs format modification * add test for cliff head * format correction * update test file * format correction * update test file * format correction * update test file * format correction * docformatter correction * update test file * format * add README * add README * add test for cliff data converter * add test for cliff mesh estimator * update tests * merge cliff mesh estimator to mesh estimator * revert unnecessary tests * format * Revert to CliffMeshEstimator * Fix wrong class name in test * Fix linter * Fix bugs for test architecture * Fix test_data_converters.py * Update download links * Update pytorch3d install in workflow * Format * Add additional tests * Update to ubuntu-20.04 * Update to ubuntu-20.04 * Fix pickle * Fix setup.cfg * Fix setup.cfg * Change pickle5 to pickle * Fix pandas version --------- Co-authored-by: caizhongang Co-authored-by: caizhongang --- .github/workflows/build.yml | 4 +- .github/workflows/lint.yml | 2 +- configs/cliff/README.md | 81 ++ configs/cliff/coco.py | 189 ++++ configs/cliff/resnet50_pw3d_cache.py | 225 +++++ configs/cliff/resume.py | 228 +++++ mmhuman3d/data/data_converters/__init__.py | 3 +- mmhuman3d/data/data_converters/cliff.py | 121 +++ mmhuman3d/data/datasets/pipelines/__init__.py | 35 +- .../data/datasets/pipelines/transforms.py | 40 +- mmhuman3d/models/architectures/builder.py | 3 + .../architectures/cliff_mesh_estimator.py | 881 ++++++++++++++++++ mmhuman3d/models/heads/builder.py | 2 + mmhuman3d/models/heads/cliff_head.py | 98 ++ mmhuman3d/utils/geometry.py | 21 + requirements/runtime.txt | 2 +- setup.cfg | 2 +- tests/test_data_converters.py | 8 + tests/test_datasets/test_pipelines.py | 15 + .../test_cliff_mesh_estimator.py | 417 +++++++++ .../test_models/test_heads/test_cliff_head.py | 59 ++ tools/convert_datasets.py | 2 +- 22 files changed, 2402 insertions(+), 36 deletions(-) create mode 100644 configs/cliff/README.md create mode 100644 configs/cliff/coco.py create mode 100644 configs/cliff/resnet50_pw3d_cache.py create mode 100644 configs/cliff/resume.py create mode 100644 mmhuman3d/data/data_converters/cliff.py create mode 100644 mmhuman3d/models/architectures/cliff_mesh_estimator.py create mode 100644 mmhuman3d/models/heads/cliff_head.py create mode 100644 tests/test_models/test_architectures/test_cliff_mesh_estimator.py create mode 100644 tests/test_models/test_heads/test_cliff_head.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7fd2b25f..76a50c2c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,7 +26,7 @@ concurrency: jobs: build_cuda101: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 strategy: matrix: python-version: [3.8] @@ -69,7 +69,7 @@ jobs: - name: Install pytorch3d run: | conda install -c fvcore -c iopath -c conda-forge fvcore iopath -y - conda install pytorch3d -c pytorch3d + pip install 
"git+https://github.com/facebookresearch/pytorch3d.git" - name: Install MMCV run: | pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/cpu/torch${{matrix.torch}}/index.html diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 822bdcd9..2208335d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,7 +8,7 @@ concurrency: jobs: lint: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 diff --git a/configs/cliff/README.md b/configs/cliff/README.md new file mode 100644 index 00000000..8f536fa6 --- /dev/null +++ b/configs/cliff/README.md @@ -0,0 +1,81 @@ +# CLIFF + +## Introduction + +We provide the config files for CLIFF: [CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation](https://arxiv.org/pdf/2208.00571.pdf). + +```BibTeX + +@Inproceedings{li2022cliff, + author = {Li, Zhihao and + Liu, Jianzhuang and + Zhang, Zhensong and + Xu, Songcen and + Yan, Youliang}, + title = {CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation}, + booktitle = {ECCV}, + year = {2022} +} + +``` + +## Notes + +- [SMPL](https://smpl.is.tue.mpg.de/) v1.0 is used in our experiments. +- [J_regressor_extra.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_extra.npy?versionId=CAEQHhiBgIDD6c3V6xciIGIwZDEzYWI5NTBlOTRkODU4OTE1M2Y4YTI0NTVlZGM1) +- [J_regressor_h36m.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_h36m.npy?versionId=CAEQHhiBgIDE6c3V6xciIDdjYzE3MzQ4MmU4MzQyNmRiZDA5YTg2YTI5YWFkNjRi) +- [pascal_occluders.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/pascal_occluders.npy?versionId=CAEQOhiBgMCH2fqigxgiIDY0YzRiNThkMjU1MzRjZTliMTBhZmFmYWY0MTViMTIx) +- [resnet50_a1h2_176-001a1197.pth](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1h2_176-001a1197.pth) +- [resnet50_a1h2_176-001a1197.pth(alternative download link)](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_a1h2_176-001a1197.pth) + +Download the above resources and arrange them in the following file structure: + +```text +mmhuman3d +├── mmhuman3d +├── docs +├── tests +├── tools +├── configs +└── data + ├── checkpoints + │ ├── resnet50_a1h2_176-001a1197.pth + ├── body_models + │ ├── J_regressor_extra.npy + │ ├── J_regressor_h36m.npy + │ ├── smpl_mean_params.npz + │ └── smpl + │ ├── SMPL_FEMALE.pkl + │ ├── SMPL_MALE.pkl + │ └── SMPL_NEUTRAL.pkl + ├── preprocessed_datasets + │ ├── cliff_coco_train.npz + │ ├── cliff_mpii_train.npz + │ ├── h36m_mosh_train.npz + │ ├── muco3dhp_train.npz + │ ├── mpi_inf_3dhp_train.npz + │ └── pw3d_test.npz + ├── occluders + │ ├── pascal_occluders.npy + └── datasets + ├── coco + ├── h36m + ├── muco + ├── mpi_inf_3dhp + ├── mpii + └── pw3d +``` + +## Training +Stage 1: First use [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) to train. + +Stage 2: After around 150 epoches, switch to [resume.py](resume.py) by using "--resume-from" optional argument. + +## Results and Models + +We evaluate HMR on 3DPW. Values are MPJPE/PA-MPJPE. 
+ +| Config | 3DPW | Download | +|:---------------------------------------------------------:|:-------------:|:------:| +| Stage 1: [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) | 48.65 / 76.49 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_cliff-8328e2e2_20230327.pth) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/20220909_142945.log) +| Stage 2: [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) | 47.38 / 75.08 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_cliff_new-1e639f1d_20230327.pth) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/20230222_092227.log) diff --git a/configs/cliff/coco.py b/configs/cliff/coco.py new file mode 100644 index 00000000..6651909e --- /dev/null +++ b/configs/cliff/coco.py @@ -0,0 +1,189 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=1e-4), + head=dict(type='Adam', lr=1e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=800) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', 
img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + ], + partition=[1.0], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/cliff/resnet50_pw3d_cache.py b/configs/cliff/resnet50_pw3d_cache.py new file mode 100644 index 00000000..d0d8becd --- /dev/null +++ b/configs/cliff/resnet50_pw3d_cache.py @@ -0,0 +1,225 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=3e-4), + head=dict(type='Adam', lr=3e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='step', gamma=0.1, step=[100]) +runner = dict(type='EpochBasedRunner', max_epochs=250) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + 
checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'h36m': 'data/cache/h36m_mosh_train_smpl_54.npz', + 'mpi_inf_3dhp': 'data/cache/mpi_inf_3dhp_train_smpl_54.npz', + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', + 'cliff_mpii': 'data/cache/cliff_mpii_train_smpl_54.npz', + 'pw3d': 'data/cache/pw3d_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + 
pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['h36m'], + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='mpi_inf_3dhp', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['mpi_inf_3dhp'], + ann_file='mpi_inf_3dhp_train.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_mpii'], + ann_file='cliff_mpii_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + dict( + type=dataset_type, + dataset_name='pw3d', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['pw3d'], + ann_file='pw3d_train.npz'), + ], + partition=[0.4, 0.1, 0.1, 0.2, 0.2], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/cliff/resume.py b/configs/cliff/resume.py new file mode 100644 index 00000000..652de149 --- /dev/null +++ b/configs/cliff/resume.py @@ -0,0 +1,228 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=3e-4), + head=dict(type='Adam', lr=3e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='step', gamma=0.1, step=[100]) +runner = dict(type='EpochBasedRunner', max_epochs=160) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + 
loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict( + type='SyntheticOcclusion', + occluders_file='data/occluders/pascal_occluders.npy'), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'h36m': 'data/cache/h36m_mosh_train_smpl_54.npz', + 'muco': 'data/cache/muco3dhp_train.npz', + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', + 'cliff_mpii': 'data/cache/cliff_mpii_train_smpl_54.npz', + 'pw3d': 'data/cache/pw3d_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['h36m'], + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='muco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['muco'], + ann_file='muco3dhp_train.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_mpii'], + ann_file='cliff_mpii_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + 
cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + dict( + type=dataset_type, + dataset_name='pw3d', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['pw3d'], + ann_file='pw3d_train.npz'), + ], + partition=[0.4, 0.1, 0.1, 0.2, 0.2], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/mmhuman3d/data/data_converters/__init__.py b/mmhuman3d/data/data_converters/__init__.py index c056f926..7471bfa4 100644 --- a/mmhuman3d/data/data_converters/__init__.py +++ b/mmhuman3d/data/data_converters/__init__.py @@ -1,6 +1,7 @@ from .agora import AgoraConverter from .amass import AmassConverter from .builder import build_data_converter +from .cliff import CliffConverter from .coco import CocoConverter from .coco_hybrik import CocoHybrIKConverter from .coco_wholebody import CocoWholebodyConverter @@ -43,5 +44,5 @@ 'SurrealConverter', 'InstaVibeConverter', 'SpinConverter', 'VibeConverter', 'HuMManConverter', 'FFHQFlameConverter', 'ExposeCuratedFitsConverter', 'ExposeSPINSMPLXConverter', 'FreihandConverter', 'StirlingConverter', - 'EHFConverter' + 'EHFConverter', 'CliffConverter' ] diff --git a/mmhuman3d/data/data_converters/cliff.py b/mmhuman3d/data/data_converters/cliff.py new file mode 100644 index 00000000..e34e897f --- /dev/null +++ b/mmhuman3d/data/data_converters/cliff.py @@ -0,0 +1,121 @@ +import os +from typing import List + +import numpy as np + +from mmhuman3d.core.conventions.keypoints_mapping import convert_kps +from mmhuman3d.data.data_structures.human_data import HumanData +from mmhuman3d.data.data_structures.multi_human_data import MultiHumanData +from .base_converter import BaseModeConverter +from .builder import DATA_CONVERTERS + + +@DATA_CONVERTERS.register_module() +class CliffConverter(BaseModeConverter): + """CLIFF datasets converter `Carrying Location Information in Full Frames + into Human Pose and Shape Estimation' More details can be found in the + `paper. + + `__. + Args: + modes (list): 'coco', 'mpii' + for accepted modes + """ + + ACCEPTED_MODES = ['coco', 'mpii'] + + def __init__(self, modes: List = []) -> None: + super(CliffConverter, self).__init__(modes) + + # def __init__(self) -> None: + self.mapping_dict = { + 'coco': 'coco2014part_cliffGT.npz', + 'mpii': 'mpii_cliffGT.npz', + } + + def convert_by_mode(self, + dataset_path: str, + out_path: str, + mode: str, + enable_multi_human_data: bool = False) -> dict: + """ + Args: + dataset_path (str): Path to directory where spin preprocessed + npz files are stored + out_path (str): Path to directory to save preprocessed npz file + mode (str): Mode in accepted modes + enable_multi_human_data (bool): + Whether to generate a multi-human data. If set to True, + stored in MultiHumanData() format. 
+ Default: False, stored in HumanData() format. + + Returns: + dict: + A dict containing keys image_path, bbox_xywh, keypoints2d, + keypoints2d_mask,stored in HumanData() format. keypoints3d, + keypoints3d_mask, smpl are added if available. + + """ + if enable_multi_human_data: + # use MultiHumanData to store all data + human_data = MultiHumanData() + else: + # use HumanData to store all data + human_data = HumanData() + + image_path_, keypoints2d_, bbox_xywh_ = [], [], [] + + if mode in self.mapping_dict.keys(): + seq_file = self.mapping_dict[mode] + seq_path = os.path.join(dataset_path, seq_file) + + data = np.load(seq_path) + + keypoints2d_ = data['part'] + image_path_ = data['imgname'] + + # center scale to bbox + w = h = data['scale'] * 200 + x = data['center'][:, 0] - w / 2 + y = data['center'][:, 1] - h / 2 + + bbox_xywh_ = np.column_stack((x, y, w, h)) + + # convert keypoints + bbox_xywh_ = np.array(bbox_xywh_).reshape((-1, 4)) + bbox_xywh_ = np.hstack([bbox_xywh_, np.ones([bbox_xywh_.shape[0], 1])]) + keypoints2d_ = np.array(keypoints2d_).reshape((-1, 24, 3)) + keypoints2d_, keypoints2d_mask = convert_kps(keypoints2d_, 'smpl_24', + 'human_data') + + if 'S' in data: + keypoints3d_ = data['S'] + keypoints3d_ = np.array(keypoints3d_).reshape((-1, 24, 4)) + keypoints3d_, keypoints3d_mask = convert_kps( + keypoints3d_, 'smpl_24', 'human_data') + human_data['keypoints3d_mask'] = keypoints3d_mask + human_data['keypoints3d'] = keypoints3d_ + + if 'has_smpl' in data: + has_smpl = data['has_smpl'] + smpl = {} + smpl['body_pose'] = np.array(data['pose'][:, 3:]).reshape( + (-1, 23, 3)) + smpl['global_orient'] = np.array(data['pose'][:, :3]).reshape( + (-1, 3)) + smpl['betas'] = np.array(data['shape']).reshape((-1, 10)) + human_data['smpl'] = smpl + human_data['has_smpl'] = has_smpl + + human_data['image_path'] = image_path_.tolist() + human_data['bbox_xywh'] = bbox_xywh_ + human_data['keypoints2d_mask'] = keypoints2d_mask + human_data['keypoints2d'] = keypoints2d_ + human_data['config'] = mode + human_data.compress_keypoints_by_mask() + + # store the data struct + if not os.path.isdir(out_path): + os.makedirs(out_path) + out_file = os.path.join(out_path, f'cliff_{mode}_train.npz') + human_data.dump(out_file) diff --git a/mmhuman3d/data/datasets/pipelines/__init__.py b/mmhuman3d/data/datasets/pipelines/__init__.py index ae6d9dbe..6551ed63 100644 --- a/mmhuman3d/data/datasets/pipelines/__init__.py +++ b/mmhuman3d/data/datasets/pipelines/__init__.py @@ -22,6 +22,7 @@ BBoxCenterJitter, CenterCrop, ColorJitter, + GetBboxInfo, GetRandomScaleRotation, Lighting, MeshAffine, @@ -33,31 +34,11 @@ ) __all__ = [ - 'Compose', - 'to_tensor', - 'ToTensor', - 'ImageToTensor', - 'ToPIL', - 'ToNumpy', - 'Transpose', - 'Collect', - 'LoadImageFromFile', - 'CenterCrop', - 'RandomHorizontalFlip', - 'ColorJitter', - 'Lighting', - 'RandomChannelNoise', - 'GetRandomScaleRotation', - 'MeshAffine', - 'HybrIKRandomFlip', - 'HybrIKAffine', - 'GenerateHybrIKTarget', - 'RandomDPG', - 'RandomOcclusion', - 'Rotation', - 'NewKeypointsSelection', - 'Normalize', - 'SyntheticOcclusion', - 'BBoxCenterJitter', - 'SimulateLowRes', + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToPIL', 'ToNumpy', + 'Transpose', 'Collect', 'LoadImageFromFile', 'CenterCrop', + 'RandomHorizontalFlip', 'ColorJitter', 'Lighting', 'RandomChannelNoise', + 'GetRandomScaleRotation', 'MeshAffine', 'HybrIKRandomFlip', 'HybrIKAffine', + 'GenerateHybrIKTarget', 'RandomDPG', 'RandomOcclusion', 'Rotation', + 'NewKeypointsSelection', 'Normalize', 
'SyntheticOcclusion', + 'BBoxCenterJitter', 'SimulateLowRes', 'GetBboxInfo' ] diff --git a/mmhuman3d/data/datasets/pipelines/transforms.py b/mmhuman3d/data/datasets/pipelines/transforms.py index 078bcd38..da2f42f9 100644 --- a/mmhuman3d/data/datasets/pipelines/transforms.py +++ b/mmhuman3d/data/datasets/pipelines/transforms.py @@ -746,14 +746,18 @@ class MeshAffine: """ def __init__(self, img_res): - self.img_res = img_res - self.image_size = np.array([img_res, img_res]) + if isinstance(img_res, tuple): + self.image_size = img_res + else: + self.image_size = np.array([img_res, img_res]) def __call__(self, results): c = results['center'] s = results['scale'] r = results['rotation'] trans = get_affine_transform(c, s, r, self.image_size) + inv_trans = get_affine_transform(c, s, 0., self.image_size, inv=True) + crop_trans = get_affine_transform(c, s, 0., self.image_size) if 'img' in results: img = results['img'] @@ -797,6 +801,8 @@ def __call__(self, results): global_orient = _rotate_smpl_pose(global_orient, r) results['smplx_global_orient'] = global_orient + results['crop_trans'] = crop_trans + results['inv_trans'] = inv_trans return results @@ -951,3 +957,33 @@ def __call__(self, results): results['img'] = img return results + + +@PIPELINES.register_module() +class GetBboxInfo: + """Get bbox for cliff.""" + + def estimate_focal_length(self, img_h, img_w): + return (img_w * img_w + img_h * img_h)**0.5 # fov: 55 degree + + def __call__(self, results): + """(1) Get focal length from original image (2) get bbox_info from c + and s.""" + img = results['img'] + img_h, img_w = img.shape[:2] + focal_length = self.estimate_focal_length(img_h, img_w) + + results['img_h'] = img_h + results['img_w'] = img_w + results['focal_length'] = focal_length + cx, cy = results['center'] + s = results['scale'][0] + + bbox_info = np.stack([cx - img_w / 2., cy - img_h / 2., s]) + bbox_info[:2] = bbox_info[:2] / focal_length * 2.8 # [-1, 1] + bbox_info[2] = (bbox_info[2] - 0.24 * focal_length) / ( + 0.06 * focal_length) # [-1, 1] + + results['bbox_info'] = np.float32(bbox_info) + + return results diff --git a/mmhuman3d/models/architectures/builder.py b/mmhuman3d/models/architectures/builder.py index 4e504d82..15ffbc01 100644 --- a/mmhuman3d/models/architectures/builder.py +++ b/mmhuman3d/models/architectures/builder.py @@ -3,6 +3,7 @@ from mmcv.cnn import MODELS as MMCV_MODELS from mmcv.utils import Registry +from .cliff_mesh_estimator import CliffImageBodyModelEstimator from .expressive_mesh_estimator import SMPLXImageBodyModelEstimator from .hybrik import HybrIK_trainer from .mesh_estimator import ImageBodyModelEstimator, VideoBodyModelEstimator @@ -25,6 +26,8 @@ def build_from_cfg(cfg, registry, default_args=None): name='VideoBodyModelEstimator', module=VideoBodyModelEstimator) ARCHITECTURES.register_module( name='SMPLXImageBodyModelEstimator', module=SMPLXImageBodyModelEstimator) +ARCHITECTURES.register_module( + name='CliffImageBodyModelEstimator', module=CliffImageBodyModelEstimator) ARCHITECTURES.register_module(name='PyMAFX', module=PyMAFX) diff --git a/mmhuman3d/models/architectures/cliff_mesh_estimator.py b/mmhuman3d/models/architectures/cliff_mesh_estimator.py new file mode 100644 index 00000000..cea36317 --- /dev/null +++ b/mmhuman3d/models/architectures/cliff_mesh_estimator.py @@ -0,0 +1,881 @@ +from abc import ABCMeta, abstractmethod +from typing import Optional, Tuple, Union + +import torch +import torch.nn.functional as F + +import mmhuman3d.core.visualization.visualize_smpl as visualize_smpl +from 
mmhuman3d.core.conventions.keypoints_mapping import get_keypoint_idx +from mmhuman3d.models.utils import FitsDict +from mmhuman3d.utils.geometry import ( + batch_rodrigues, + cam_crop2full, + estimate_translation, + perspective_projection, + project_points, + rotation_matrix_to_angle_axis, +) +from ..backbones.builder import build_backbone +from ..body_models.builder import build_body_model +from ..discriminators.builder import build_discriminator +from ..heads.builder import build_head +from ..losses.builder import build_loss +from ..necks.builder import build_neck +from ..registrants.builder import build_registrant +from .base_architecture import BaseArchitecture + + +def set_requires_grad(nets, requires_grad=False): + """Set requies_grad for all the networks. + + Args: + nets (nn.Module | list[nn.Module]): A list of networks or a single + network. + requires_grad (bool): Whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + +class BodyModelEstimator(BaseArchitecture, metaclass=ABCMeta): + """BodyModelEstimator Architecture. + + Args: + backbone (dict | None, optional): Backbone config dict. Default: None. + neck (dict | None, optional): Neck config dict. Default: None + head (dict | None, optional): Regressor config dict. Default: None. + disc (dict | None, optional): Discriminator config dict. + Default: None. + registration (dict | None, optional): Registration config dict. + Default: None. + body_model_train (dict | None, optional): SMPL config dict during + training. Default: None. + body_model_test (dict | None, optional): SMPL config dict during + test. Default: None. + convention (str, optional): Keypoints convention. Default: "human_data" + loss_keypoints2d (dict | None, optional): Losses config dict for + 2D keypoints. Default: None. + loss_keypoints3d (dict | None, optional): Losses config dict for + 3D keypoints. Default: None. + loss_vertex (dict | None, optional): Losses config dict for mesh + vertices. Default: None + loss_smpl_pose (dict | None, optional): Losses config dict for smpl + pose. Default: None + loss_smpl_betas (dict | None, optional): Losses config dict for smpl + betas. Default: None + loss_camera (dict | None, optional): Losses config dict for predicted + camera parameters. Default: None + loss_adv (dict | None, optional): Losses config for adversial + training. Default: None. + loss_segm_mask (dict | None, optional): Losses config for predicted + part segmentation. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + backbone: Optional[Union[dict, None]] = None, + neck: Optional[Union[dict, None]] = None, + head: Optional[Union[dict, None]] = None, + disc: Optional[Union[dict, None]] = None, + registration: Optional[Union[dict, None]] = None, + body_model_train: Optional[Union[dict, None]] = None, + body_model_test: Optional[Union[dict, None]] = None, + convention: Optional[str] = 'human_data', + loss_keypoints2d: Optional[Union[dict, None]] = None, + loss_keypoints3d: Optional[Union[dict, None]] = None, + loss_vertex: Optional[Union[dict, None]] = None, + loss_smpl_pose: Optional[Union[dict, None]] = None, + loss_smpl_betas: Optional[Union[dict, None]] = None, + loss_camera: Optional[Union[dict, None]] = None, + loss_adv: Optional[Union[dict, None]] = None, + loss_segm_mask: Optional[Union[dict, None]] = None, + init_cfg: Optional[Union[list, dict, None]] = None): + super(BodyModelEstimator, self).__init__(init_cfg) + self.backbone = build_backbone(backbone) + self.neck = build_neck(neck) + self.head = build_head(head) + self.disc = build_discriminator(disc) + + self.body_model_train = build_body_model(body_model_train) + self.body_model_test = build_body_model(body_model_test) + self.convention = convention + + # TODO: support HMR+ + + self.registration = registration + if registration is not None: + self.fits_dict = FitsDict(fits='static') + self.registration_mode = self.registration['mode'] + self.registrant = build_registrant(registration['registrant']) + else: + self.registrant = None + + self.loss_keypoints2d = build_loss(loss_keypoints2d) + self.loss_keypoints3d = build_loss(loss_keypoints3d) + + self.loss_vertex = build_loss(loss_vertex) + self.loss_smpl_pose = build_loss(loss_smpl_pose) + self.loss_smpl_betas = build_loss(loss_smpl_betas) + self.loss_adv = build_loss(loss_adv) + self.loss_camera = build_loss(loss_camera) + self.loss_segm_mask = build_loss(loss_segm_mask) + set_requires_grad(self.body_model_train, False) + set_requires_grad(self.body_model_test, False) + + def train_step(self, data_batch, optimizer, **kwargs): + """Train step function. + + In this function, the detector will finish the train step following + the pipeline: + 1. get fake and real SMPL parameters + 2. optimize discriminator (if have) + 3. optimize generator + If `self.train_cfg.disc_step > 1`, the train step will contain multiple + iterations for optimizing discriminator with different input data and + only one iteration for optimizing generator after `disc_step` + iterations for discriminator. + Args: + data_batch (torch.Tensor): Batch of data as input. + optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for + generator and discriminator (if have). + Returns: + outputs (dict): Dict with loss, information for logger, + the number of samples. 
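+
+        Example:
+            A minimal, illustrative sketch of the CLIFF-specific forward in
+            this step (names follow this file; the exact feature shape
+            depends on the backbone and input resolution in the config):
+
+            >>> features = self.backbone(data_batch['img'])
+            >>> predictions = self.head(features, data_batch['bbox_info'])
+            >>> # predictions holds 'pred_pose', 'pred_shape' and 'pred_cam'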
+ """ + if self.backbone is not None: + img = data_batch['img'] + features = self.backbone(img) + else: + features = data_batch['features'] + + if self.neck is not None: + features = self.neck(features) + + # NOTE: features and bbox_info taken as input for Cliff + bbox_info = data_batch['bbox_info'] + predictions = self.head(features, bbox_info) + targets = self.prepare_targets(data_batch) + + # optimize discriminator (if have) + if self.disc is not None: + self.optimize_discrinimator(predictions, data_batch, optimizer) + + if self.registration is not None: + targets = self.run_registration(predictions, targets) + + losses = self.compute_losses(predictions, targets) + # optimizer generator part + if self.disc is not None: + adv_loss = self.optimize_generator(predictions) + losses.update(adv_loss) + + loss, log_vars = self._parse_losses(losses) + for key in optimizer.keys(): + optimizer[key].zero_grad() + loss.backward() + for key in optimizer.keys(): + optimizer[key].step() + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + return outputs + + def run_registration( + self, + predictions: dict, + targets: dict, + threshold: Optional[float] = 10.0, + focal_length: Optional[float] = 5000.0, + img_res: Optional[Union[Tuple[int], int]] = 224) -> dict: + """Run registration on 2D keypoinst in predictions to obtain SMPL + parameters as pseudo ground truth. + + Args: + predictions (dict): predicted SMPL parameters are used for + initialization. + targets (dict): existing ground truths with 2D keypoints + threshold (float, optional): the threshold to update fits + dictionary. Default: 10.0. + focal_length (tuple(int) | int, optional): camera focal_length + img_res (int, optional): image resolution + + Returns: + targets: contains additional SMPL parameters + """ + + img_metas = targets['img_metas'] + dataset_name = [meta['dataset_name'] for meta in img_metas + ] # name of the dataset the image comes from + + indices = targets['sample_idx'].squeeze() + is_flipped = targets['is_flipped'].squeeze().bool( + ) # flag that indicates whether image was flipped + # during data augmentation + rot_angle = targets['rotation'].squeeze( + ) # rotation angle used for data augmentation Q + gt_betas = targets['smpl_betas'].float() + gt_global_orient = targets['smpl_global_orient'].float() + gt_pose = targets['smpl_body_pose'].float().view(-1, 69) + + pred_rotmat = predictions['pred_pose'].detach().clone() + pred_betas = predictions['pred_shape'].detach().clone() + pred_cam = predictions['pred_cam'].detach().clone() + pred_cam_t = torch.stack([ + pred_cam[:, 1], pred_cam[:, 2], 2 * focal_length / + (img_res * pred_cam[:, 0] + 1e-9) + ], + dim=-1) + + gt_keypoints_2d = targets['keypoints2d'].float() + num_keypoints = gt_keypoints_2d.shape[1] + + has_smpl = targets['has_smpl'].view( + -1).bool() # flag that indicates whether SMPL parameters are valid + batch_size = has_smpl.shape[0] + device = has_smpl.device + + # Get GT vertices and model joints + # Note that gt_model_joints is different from gt_joints as + # it comes from SMPL + gt_out = self.body_model_train( + betas=gt_betas, body_pose=gt_pose, global_orient=gt_global_orient) + # TODO: support more convention + assert num_keypoints == 49 + gt_model_joints = gt_out['joints'] + gt_vertices = gt_out['vertices'] + + # Get current best fits from the dictionary + opt_pose, opt_betas = self.fits_dict[(dataset_name, indices.cpu(), + rot_angle.cpu(), + is_flipped.cpu())] + + opt_pose = opt_pose.to(device) + opt_betas 
= opt_betas.to(device) + opt_output = self.body_model_train( + betas=opt_betas, + body_pose=opt_pose[:, 3:], + global_orient=opt_pose[:, :3]) + opt_joints = opt_output['joints'] + opt_vertices = opt_output['vertices'] + + gt_keypoints_2d_orig = gt_keypoints_2d.clone() + # Estimate camera translation given the model joints and 2D keypoints + # by minimizing a weighted least squares loss + gt_cam_t = estimate_translation( + gt_model_joints, + gt_keypoints_2d_orig, + focal_length=focal_length, + img_size=img_res) + + opt_cam_t = estimate_translation( + opt_joints, + gt_keypoints_2d_orig, + focal_length=focal_length, + img_size=img_res) + + with torch.no_grad(): + loss_dict = self.registrant.evaluate( + global_orient=opt_pose[:, :3], + body_pose=opt_pose[:, 3:], + betas=opt_betas, + transl=opt_cam_t, + keypoints2d=gt_keypoints_2d_orig[:, :, :2], + keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2], + reduction_override='none') + opt_joint_loss = loss_dict['keypoint2d_loss'].sum(dim=-1).sum(dim=-1) + + if self.registration_mode == 'in_the_loop': + # Convert predicted rotation matrices to axis-angle + pred_rotmat_hom = torch.cat([ + pred_rotmat.detach().view(-1, 3, 3).detach(), + torch.tensor([0, 0, 1], dtype=torch.float32, + device=device).view(1, 3, 1).expand( + batch_size * 24, -1, -1) + ], + dim=-1) + pred_pose = rotation_matrix_to_angle_axis( + pred_rotmat_hom).contiguous().view(batch_size, -1) + # tgm.rotation_matrix_to_angle_axis returns NaN for 0 rotation, + # so manually hack it + pred_pose[torch.isnan(pred_pose)] = 0.0 + + registrant_output = self.registrant( + keypoints2d=gt_keypoints_2d_orig[:, :, :2], + keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2], + init_global_orient=pred_pose[:, :3], + init_transl=pred_cam_t, + init_body_pose=pred_pose[:, 3:], + init_betas=pred_betas, + return_joints=True, + return_verts=True, + return_losses=True) + new_opt_vertices = registrant_output[ + 'vertices'] - pred_cam_t.unsqueeze(1) + new_opt_joints = registrant_output[ + 'joints'] - pred_cam_t.unsqueeze(1) + + new_opt_global_orient = registrant_output['global_orient'] + new_opt_body_pose = registrant_output['body_pose'] + new_opt_pose = torch.cat( + [new_opt_global_orient, new_opt_body_pose], dim=1) + + new_opt_betas = registrant_output['betas'] + new_opt_cam_t = registrant_output['transl'] + new_opt_joint_loss = registrant_output['keypoint2d_loss'].sum( + dim=-1).sum(dim=-1) + + # Will update the dictionary for the examples where the new loss + # is less than the current one + update = (new_opt_joint_loss < opt_joint_loss) + + opt_joint_loss[update] = new_opt_joint_loss[update] + opt_vertices[update, :] = new_opt_vertices[update, :] + opt_joints[update, :] = new_opt_joints[update, :] + opt_pose[update, :] = new_opt_pose[update, :] + opt_betas[update, :] = new_opt_betas[update, :] + opt_cam_t[update, :] = new_opt_cam_t[update, :] + + self.fits_dict[(dataset_name, indices.cpu(), rot_angle.cpu(), + is_flipped.cpu(), + update.cpu())] = (opt_pose.cpu(), opt_betas.cpu()) + + # Replace extreme betas with zero betas + opt_betas[(opt_betas.abs() > 3).any(dim=-1)] = 0. 
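+        # (Betas that far from the mean almost always come from a failed
+        # fit, so the shape is reset to the mean shape instead.)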
+ + # Replace the optimized parameters with the ground truth parameters, + # if available + opt_vertices[has_smpl, :, :] = gt_vertices[has_smpl, :, :] + opt_cam_t[has_smpl, :] = gt_cam_t[has_smpl, :] + opt_joints[has_smpl, :, :] = gt_model_joints[has_smpl, :, :] + opt_pose[has_smpl, 3:] = gt_pose[has_smpl, :] + opt_pose[has_smpl, :3] = gt_global_orient[has_smpl, :] + opt_betas[has_smpl, :] = gt_betas[has_smpl, :] + + # Assert whether a fit is valid by comparing the joint loss with + # the threshold + valid_fit = (opt_joint_loss < threshold).to(device) + valid_fit = valid_fit | has_smpl + targets['valid_fit'] = valid_fit + + targets['opt_vertices'] = opt_vertices + targets['opt_cam_t'] = opt_cam_t + targets['opt_joints'] = opt_joints + targets['opt_pose'] = opt_pose + targets['opt_betas'] = opt_betas + + return targets + + def optimize_discrinimator(self, predictions: dict, data_batch: dict, + optimizer: dict): + """Optimize discrinimator during adversarial training.""" + set_requires_grad(self.disc, True) + fake_data = self.make_fake_data(predictions, requires_grad=False) + real_data = self.make_real_data(data_batch) + fake_score = self.disc(fake_data) + real_score = self.disc(real_data) + + disc_losses = {} + disc_losses['real_loss'] = self.loss_adv( + real_score, target_is_real=True, is_disc=True) + disc_losses['fake_loss'] = self.loss_adv( + fake_score, target_is_real=False, is_disc=True) + loss_disc, log_vars_d = self._parse_losses(disc_losses) + + optimizer['disc'].zero_grad() + loss_disc.backward() + optimizer['disc'].step() + + def optimize_generator(self, predictions: dict): + """Optimize generator during adversarial training.""" + set_requires_grad(self.disc, False) + fake_data = self.make_fake_data(predictions, requires_grad=True) + pred_score = self.disc(fake_data) + loss_adv = self.loss_adv( + pred_score, target_is_real=True, is_disc=False) + loss = dict(adv_loss=loss_adv) + return loss + + def compute_keypoints3d_loss( + self, + pred_keypoints3d: torch.Tensor, + gt_keypoints3d: torch.Tensor, + has_keypoints3d: Optional[torch.Tensor] = None): + """Compute loss for 3d keypoints.""" + keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1) + keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3) + pred_keypoints3d = pred_keypoints3d.float() + gt_keypoints3d = gt_keypoints3d[:, :, :3].float() + + # currently, only mpi_inf_3dhp and h36m have 3d keypoints + # both datasets have right_hip_extra and left_hip_extra + right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention) + left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention) + gt_pelvis = (gt_keypoints3d[:, right_hip_idx, :] + + gt_keypoints3d[:, left_hip_idx, :]) / 2 + pred_pelvis = (pred_keypoints3d[:, right_hip_idx, :] + + pred_keypoints3d[:, left_hip_idx, :]) / 2 + + gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, None, :] + pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :] + loss = self.loss_keypoints3d( + pred_keypoints3d, gt_keypoints3d, reduction_override='none') + + # If has_keypoints3d is not None, then computes the losses on the + # instances that have ground-truth keypoints3d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints3d + # which have positive confidence. 
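+        # In both branches the per-keypoint loss is weighted by the
+        # confidence before averaging, so keypoints with zero confidence
+        # contribute nothing to the final value.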
+ + # has_keypoints3d is None when the key has_keypoints3d + # is not in the datasets + if has_keypoints3d is None: + + valid_pos = keypoints3d_conf > 0 + if keypoints3d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = torch.sum(loss * keypoints3d_conf) + loss /= keypoints3d_conf[valid_pos].numel() + else: + + keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1] + if keypoints3d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = loss[has_keypoints3d == 1] + loss = (loss * keypoints3d_conf).mean() + return loss + + def compute_keypoints2d_loss( + self, + pred_keypoints3d: torch.Tensor, + pred_cam: torch.Tensor, + gt_keypoints2d: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 5000, + has_keypoints2d: Optional[torch.Tensor] = None): + """Compute loss for 2d keypoints.""" + keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1) + keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2) + gt_keypoints2d = gt_keypoints2d[:, :, :2].float() + pred_keypoints2d = project_points( + pred_keypoints3d, + pred_cam, + focal_length=focal_length, + img_res=img_res) + # Normalize keypoints to [-1,1] + # The coordinate origin of pred_keypoints_2d is + # the center of the input image. + pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1) + # The coordinate origin of gt_keypoints_2d is + # the top left corner of the input image. + gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1 + loss = self.loss_keypoints2d( + pred_keypoints2d, gt_keypoints2d, reduction_override='none') + + # If has_keypoints2d is not None, then computes the losses on the + # instances that have ground-truth keypoints2d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints2d + # which have positive confidence. 
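+        # Note the two normalisation formulas above: predictions are
+        # already centred on the image centre, while the ground truth uses
+        # the top-left corner as origin, hence the extra `- 1` for gt.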
+ # has_keypoints2d is None when the key has_keypoints2d + # is not in the datasets + + if has_keypoints2d is None: + valid_pos = keypoints2d_conf > 0 + if keypoints2d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = torch.sum(loss * keypoints2d_conf) + loss /= keypoints2d_conf[valid_pos].numel() + else: + keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1] + if keypoints2d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = loss[has_keypoints2d == 1] + loss = (loss * keypoints2d_conf).mean() + + return loss + + def compute_keypoints2d_loss_cliff( + self, + pred_keypoints3d: torch.Tensor, + pred_cam: torch.Tensor, + gt_keypoints2d: torch.Tensor, + camera_center: torch.Tensor, + focal_length: torch.Tensor, + trans: torch.Tensor, + img_res: Optional[int] = 224, + has_keypoints2d: Optional[torch.Tensor] = None): + """Compute loss for 2d keypoints.""" + keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1) + keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2) + gt_keypoints2d = gt_keypoints2d[:, :, :2].float() + + device = gt_keypoints2d.device + batch_size, num_keypoints = pred_keypoints3d.shape[0:2] + + pred_keypoints2d = perspective_projection( + pred_keypoints3d, + rotation=torch.eye(3, device=device).unsqueeze(0).expand( + batch_size, -1, -1), + translation=pred_cam, + focal_length=focal_length, + camera_center=camera_center) + + pred_keypoints2d = torch.cat( + (pred_keypoints2d, torch.ones(batch_size, num_keypoints, + 1).to(device)), + dim=2) + # trans @ pred_keypoints2d2 + pred_keypoints2d = torch.einsum('bij,bkj->bki', trans, + pred_keypoints2d) + + # The coordinate origin of pred_keypoints_2d and gt_keypoints_2d is + # the top left corner of the input image. + pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1) - 1 + gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1 + loss = self.loss_keypoints2d( + pred_keypoints2d, gt_keypoints2d, reduction_override='none') + + # If has_keypoints2d is not None, then computes the losses on the + # instances that have ground-truth keypoints2d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints2d + # which have positive confidence. 
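+        # Unlike `compute_keypoints2d_loss`, the prediction above was
+        # projected with the full-image camera (focal_length and
+        # camera_center) and then mapped into the crop by the affine
+        # `trans`, so the reprojection error accounts for where the person
+        # actually sits in the full frame.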
+ # has_keypoints2d is None when the key has_keypoints2d + # is not in the datasets + + if has_keypoints2d is None: + valid_pos = keypoints2d_conf > 0 + if keypoints2d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = torch.sum(loss * keypoints2d_conf) + loss /= keypoints2d_conf[valid_pos].numel() + else: + keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1] + if keypoints2d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = loss[has_keypoints2d == 1] + loss = (loss * keypoints2d_conf).mean() + + return loss + + def compute_vertex_loss(self, pred_vertices: torch.Tensor, + gt_vertices: torch.Tensor, has_smpl: torch.Tensor): + """Compute loss for vertices.""" + gt_vertices = gt_vertices.float() + conf = has_smpl.float().view(-1, 1, 1) + conf = conf.repeat(1, gt_vertices.shape[1], gt_vertices.shape[2]) + loss = self.loss_vertex( + pred_vertices, gt_vertices, reduction_override='none') + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_vertices) + loss = torch.sum(loss * conf) / conf[valid_pos].numel() + return loss + + def compute_smpl_pose_loss(self, pred_rotmat: torch.Tensor, + gt_pose: torch.Tensor, has_smpl: torch.Tensor): + """Compute loss for smpl pose.""" + conf = has_smpl.float().view(-1) + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_pose) + pred_rotmat = pred_rotmat[valid_pos] + gt_pose = gt_pose[valid_pos] + conf = conf[valid_pos] + gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3) + loss = self.loss_smpl_pose( + pred_rotmat, gt_rotmat, reduction_override='none') + loss = loss.view(loss.shape[0], -1).mean(-1) + loss = torch.mean(loss * conf) + return loss + + def compute_smpl_betas_loss(self, pred_betas: torch.Tensor, + gt_betas: torch.Tensor, + has_smpl: torch.Tensor): + """Compute loss for smpl betas.""" + conf = has_smpl.float().view(-1) + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_betas) + pred_betas = pred_betas[valid_pos] + gt_betas = gt_betas[valid_pos] + conf = conf[valid_pos] + loss = self.loss_smpl_betas( + pred_betas, gt_betas, reduction_override='none') + loss = loss.view(loss.shape[0], -1).mean(-1) + loss = torch.mean(loss * conf) + return loss + + def compute_camera_loss(self, cameras: torch.Tensor): + """Compute loss for predicted camera parameters.""" + loss = self.loss_camera(cameras) + return loss + + def compute_part_segmentation_loss(self, + pred_heatmap: torch.Tensor, + gt_vertices: torch.Tensor, + gt_keypoints2d: torch.Tensor, + gt_model_joints: torch.Tensor, + has_smpl: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 500): + """Compute loss for part segmentations.""" + device = gt_keypoints2d.device + gt_keypoints2d_valid = gt_keypoints2d[has_smpl == 1] + batch_size = gt_keypoints2d_valid.shape[0] + + gt_vertices_valid = gt_vertices[has_smpl == 1] + gt_model_joints_valid = gt_model_joints[has_smpl == 1] + + if batch_size == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + gt_cam_t = estimate_translation( + gt_model_joints_valid, + gt_keypoints2d_valid, + focal_length=focal_length, + img_size=img_res, + ) + + K = torch.eye(3) + K[0, 0] = focal_length + K[1, 1] = focal_length + K[2, 2] = 1 + K[0, 2] = img_res / 2. + K[1, 2] = img_res / 2. 
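+        # K is the pinhole intrinsic matrix used for rendering the part
+        # silhouettes: focal_length on the diagonal and the principal
+        # point at the centre of the (square) crop.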
+ K = K[None, :, :] + + R = torch.eye(3)[None, :, :] + device = gt_keypoints2d.device + gt_sem_mask = visualize_smpl.render_smpl( + verts=gt_vertices_valid, + R=R, + K=K, + T=gt_cam_t, + render_choice='part_silhouette', + resolution=img_res, + return_tensor=True, + body_model=self.body_model_train, + device=device, + in_ndc=False, + convention='pytorch3d', + projection='perspective', + no_grad=True, + batch_size=batch_size, + verbose=False, + ) + gt_sem_mask = torch.flip(gt_sem_mask, [1, 2]).squeeze(-1).detach() + pred_heatmap_valid = pred_heatmap[has_smpl == 1] + ph, pw = pred_heatmap_valid.size(2), pred_heatmap_valid.size(3) + h, w = gt_sem_mask.size(1), gt_sem_mask.size(2) + if ph != h or pw != w: + pred_heatmap_valid = F.interpolate( + input=pred_heatmap_valid, size=(h, w), mode='bilinear') + + loss = self.loss_segm_mask(pred_heatmap_valid, gt_sem_mask) + return loss + + def compute_losses(self, predictions: dict, targets: dict): + """Compute losses.""" + pred_betas = predictions['pred_shape'].view(-1, 10) + pred_pose = predictions['pred_pose'].view(-1, 24, 3, 3) + pred_cam_crop = predictions['pred_cam'].view(-1, 3) + + # NOTE: convert cam parameters from the crop to the full camera + img_h, img_w = targets['img_h'], targets['img_w'] + center, scale, focal_length = targets['center'], targets[ + 'scale'][:, 0], targets['focal_length'].squeeze(dim=1) + full_img_shape = torch.hstack((img_h, img_w)) + pred_cam = cam_crop2full(pred_cam_crop, center, scale, full_img_shape, + focal_length).to(torch.float32) + + gt_keypoints3d = targets['keypoints3d'] + # this should be in full frame + gt_keypoints2d = targets['keypoints2d'] + # pred_pose N, 24, 3, 3 + if self.body_model_train is not None: + pred_output = self.body_model_train( + betas=pred_betas, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, 0].unsqueeze(1), + pose2rot=False, + num_joints=gt_keypoints2d.shape[1]) + pred_keypoints3d = pred_output['joints'] + pred_vertices = pred_output['vertices'] + + # NOTE: use crop_trans to contain full -> crop so that pred keypoints + # are normalized to bbox + camera_center = torch.hstack((img_w, img_h)) / 2 + trans = targets['crop_trans'].float() + + # TODO: temp solution + if 'valid_fit' in targets: + has_smpl = targets['valid_fit'].view(-1) + # global_orient = targets['opt_pose'][:, :3].view(-1, 1, 3) + gt_pose = targets['opt_pose'] + gt_betas = targets['opt_betas'] + gt_vertices = targets['opt_vertices'] + else: + has_smpl = targets['has_smpl'].view(-1) + gt_pose = targets['smpl_body_pose'] + global_orient = targets['smpl_global_orient'].view(-1, 1, 3) + gt_pose = torch.cat((global_orient, gt_pose), dim=1).float() + gt_betas = targets['smpl_betas'].float() + + # gt_pose N, 72 + if self.body_model_train is not None: + gt_output = self.body_model_train( + betas=gt_betas, + body_pose=gt_pose[:, 3:], + global_orient=gt_pose[:, :3], + num_joints=gt_keypoints2d.shape[1]) + gt_vertices = gt_output['vertices'] + gt_model_joints = gt_output['joints'] + if 'has_keypoints3d' in targets: + has_keypoints3d = targets['has_keypoints3d'].squeeze(-1) + else: + has_keypoints3d = None + if 'has_keypoints2d' in targets: + has_keypoints2d = targets['has_keypoints2d'].squeeze(-1) + else: + has_keypoints2d = None + if 'pred_segm_mask' in predictions: + pred_segm_mask = predictions['pred_segm_mask'] + losses = {} + if self.loss_keypoints3d is not None: + losses['keypoints3d_loss'] = self.compute_keypoints3d_loss( + pred_keypoints3d, + gt_keypoints3d, + has_keypoints3d=has_keypoints3d) + if self.loss_keypoints2d 
is not None: + losses['keypoints2d_loss'] = self.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + if self.loss_vertex is not None: + losses['vertex_loss'] = self.compute_vertex_loss( + pred_vertices, gt_vertices, has_smpl) + if self.loss_smpl_pose is not None: + losses['smpl_pose_loss'] = self.compute_smpl_pose_loss( + pred_pose, gt_pose, has_smpl) + if self.loss_smpl_betas is not None: + losses['smpl_betas_loss'] = self.compute_smpl_betas_loss( + pred_betas, gt_betas, has_smpl) + if self.loss_camera is not None: + losses['camera_loss'] = self.compute_camera_loss(pred_cam) + if self.loss_segm_mask is not None: + losses['loss_segm_mask'] = self.compute_part_segmentation_loss( + pred_segm_mask, gt_vertices, gt_keypoints2d, gt_model_joints, + has_smpl) + + return losses + + @abstractmethod + def make_fake_data(self, predictions, requires_grad): + pass + + @abstractmethod + def make_real_data(self, data_batch): + pass + + @abstractmethod + def prepare_targets(self, data_batch): + pass + + def forward_train(self, **kwargs): + """Forward function for general training. + + For mesh estimation, we do not use this interface. + """ + raise NotImplementedError('This interface should not be used in ' + 'current training schedule. Please use ' + '`train_step` for training.') + + @abstractmethod + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + pass + + +class CliffImageBodyModelEstimator(BodyModelEstimator): + + def make_fake_data(self, predictions: dict, requires_grad: bool): + pred_cam = predictions['pred_cam'] + pred_pose = predictions['pred_pose'] + pred_betas = predictions['pred_shape'] + if requires_grad: + fake_data = (pred_cam, pred_pose, pred_betas) + else: + fake_data = (pred_cam.detach(), pred_pose.detach(), + pred_betas.detach()) + return fake_data + + def make_real_data(self, data_batch: dict): + transl = data_batch['adv_smpl_transl'].float() + global_orient = data_batch['adv_smpl_global_orient'] + body_pose = data_batch['adv_smpl_body_pose'] + betas = data_batch['adv_smpl_betas'].float() + pose = torch.cat((global_orient, body_pose), dim=-1).float() + real_data = (transl, pose, betas) + return real_data + + def prepare_targets(self, data_batch: dict): + # Image Mesh Estimator does not need extra process for ground truth + return data_batch + + def forward_test(self, img: torch.Tensor, img_metas: dict, **kwargs): + """Defines the computation performed at every call when testing.""" + if self.backbone is not None: + features = self.backbone(img) + else: + features = kwargs['features'] + + if self.neck is not None: + features = self.neck(features) + + # NOTE: extras for Cliff inference + bbox_info = kwargs['bbox_info'] + predictions = self.head(features, bbox_info) + pred_pose = predictions['pred_pose'] + pred_betas = predictions['pred_shape'] + pred_cam_crop = predictions['pred_cam'].view(-1, 3) + + # convert the camera parameters from the crop camera to the full camera + img_h, img_w = kwargs['img_h'], kwargs['img_w'] + center, scale, focal_length = kwargs['center'], kwargs[ + 'scale'][:, 0], kwargs['focal_length'].squeeze(dim=1) + full_img_shape = torch.hstack((img_h, img_w)) + + pred_cam = cam_crop2full(pred_cam_crop, center, scale, full_img_shape, + focal_length).to(torch.float32) + + pred_output = self.body_model_test( + betas=pred_betas, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, 
0].unsqueeze(1), + pose2rot=False) + + pred_vertices = pred_output['vertices'] + pred_keypoints_3d = pred_output['joints'] + all_preds = {} + all_preds['keypoints_3d'] = pred_keypoints_3d.detach().cpu().numpy() + all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy() + all_preds['smpl_beta'] = pred_betas.detach().cpu().numpy() + all_preds['camera'] = pred_cam.detach().cpu().numpy() + all_preds['vertices'] = pred_vertices.detach().cpu().numpy() + image_path = [] + for img_meta in img_metas: + image_path.append(img_meta['image_path']) + all_preds['image_path'] = image_path + all_preds['image_idx'] = kwargs['sample_idx'] + return all_preds diff --git a/mmhuman3d/models/heads/builder.py b/mmhuman3d/models/heads/builder.py index 5e15c8ef..bdaa7e12 100644 --- a/mmhuman3d/models/heads/builder.py +++ b/mmhuman3d/models/heads/builder.py @@ -2,6 +2,7 @@ from mmcv.utils import Registry +from .cliff_head import CliffHead from .expose_head import ExPoseBodyHead, ExPoseFaceHead, ExPoseHandHead from .hmr_head import HMRHead from .hybrik_head import HybrIKHead @@ -16,6 +17,7 @@ HEADS.register_module(name='ExPoseBodyHead', module=ExPoseBodyHead) HEADS.register_module(name='ExPoseHandHead', module=ExPoseHandHead) HEADS.register_module(name='ExPoseFaceHead', module=ExPoseFaceHead) +HEADS.register_module(name='CliffHead', module=CliffHead) HEADS.register_module(name='PyMAFXHead', module=PyMAFXHead) HEADS.register_module(name='Regressor', module=Regressor) diff --git a/mmhuman3d/models/heads/cliff_head.py b/mmhuman3d/models/heads/cliff_head.py new file mode 100644 index 00000000..037e37d1 --- /dev/null +++ b/mmhuman3d/models/heads/cliff_head.py @@ -0,0 +1,98 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.runner.base_module import BaseModule + +from mmhuman3d.utils.geometry import rot6d_to_rotmat + + +class CliffHead(BaseModule): + + def __init__(self, + feat_dim, + smpl_mean_params=None, + npose=144, + nbeta=10, + ncam=3, + nbbox=3, + hdim=1024, + init_cfg=None): + super(CliffHead, self).__init__(init_cfg=init_cfg) + self.fc1 = nn.Linear(feat_dim + nbbox + npose + nbeta + ncam, hdim) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(hdim, hdim) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(hdim, npose) + self.decshape = nn.Linear(hdim, nbeta) + self.deccam = nn.Linear(hdim, ncam) + + nn.init.xavier_uniform_(self.decpose.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape.weight, gain=0.01) + nn.init.xavier_uniform_(self.deccam.weight, gain=0.01) + + if smpl_mean_params is None: + init_pose = torch.zeros([1, npose]) + init_shape = torch.zeros([1, nbeta]) + init_cam = torch.FloatTensor([[1, 0, 0]]) + else: + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy( + mean_params['shape'][:].astype('float32')).unsqueeze(0) + init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + def forward(self, + x, + bbox_info, + init_pose=None, + init_shape=None, + init_cam=None, + n_iter=3): + + # inherited from hmr head, only support one layer feature + if isinstance(x, list) or isinstance(x, tuple): + x = x[-1] + + output_seq = False + if len(x.shape) == 4: + # use feature from the last layer of the backbone + # apply global average pooling on the feature map + x = x.mean(dim=-1).mean(dim=-1) + elif len(x.shape) == 3: + # temporal feature + raise 
NotImplementedError + + batch_size = x.shape[0] + if init_pose is None: + init_pose = self.init_pose.expand(batch_size, -1) + if init_shape is None: + init_shape = self.init_shape.expand(batch_size, -1) + if init_cam is None: + init_cam = self.init_cam.expand(batch_size, -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for i in range(n_iter): + xc = torch.cat([x, bbox_info, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + + if output_seq: + raise NotImplementedError + output = { + 'pred_pose': pred_rotmat, + 'pred_shape': pred_shape, + 'pred_cam': pred_cam + } + return output diff --git a/mmhuman3d/utils/geometry.py b/mmhuman3d/utils/geometry.py index 88dcdccd..09a8cef4 100644 --- a/mmhuman3d/utils/geometry.py +++ b/mmhuman3d/utils/geometry.py @@ -417,6 +417,27 @@ def weak_perspective_projection(points, scale, translation): return projected_points +def cam_crop2full(crop_cam, center, scale, full_img_shape, focal_length): + """convert the camera parameters from the crop camera to the full camera. + + :param crop_cam: shape=(N, 3) weak perspective camera in cropped + img coordinates (s, tx, ty) + :param center: shape=(N, 2) bbox coordinates (c_x, c_y) + :param scale: shape=(N, 1) square bbox resolution (b / 200) + :param full_img_shape: shape=(N, 2) original image height and width + :param focal_length: shape=(N,) + :return: + """ + img_h, img_w = full_img_shape[:, 0], full_img_shape[:, 1] + cx, cy, b = center[:, 0], center[:, 1], scale + bs = b * crop_cam[:, 0] + 1e-9 + tz = 2 * focal_length / bs + tx = (2 * (cx - img_w / 2.) / bs) + crop_cam[:, 1] + ty = (2 * (cy - img_h / 2.) 
/ bs) + crop_cam[:, 2] + full_cam = torch.stack([tx, ty, tz], dim=-1) + return full_cam + + def projection(pred_joints, pred_camera, iwp_mode=True): """Project 3D points on the image plane based on the given camera info, Identity rotation and Weak Perspective (IWP) camera is used when diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 40562562..cb4edc25 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -8,7 +8,7 @@ h5py matplotlib numpy opencv-python -pandas +pandas<2.0.0 pickle5 plyfile rtree diff --git a/setup.cfg b/setup.cfg index 8bc3c131..a5899aa2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,6 @@ multi_line_output = 3 include_trailing_comma = true known_standard_library = pkg_resources,setuptools known_first_party = mmhuman3d -known_third_party =PIL,cdflib,colormap,cv2,einops,h5py,matplotlib,mmcv,mpl_toolkits,numpy,openpifpaf,pickle5,plyfile,pytest,pytorch3d,pytorch_sphinx_theme,scipy,skimage,smplx,surrogate,torch,tqdm,trimesh,vedo +known_third_party = PIL,cdflib,colormap,cv2,einops,h5py,matplotlib,mmcv,mpl_toolkits,numpy,openpifpaf,pickle5,plyfile,pytest,pytorch3d,pytorch_sphinx_theme,scipy,skimage,smplx,surrogate,torch,tqdm,trimesh,vedo no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/tests/test_data_converters.py b/tests/test_data_converters.py index 4210e2fb..b9e77f80 100644 --- a/tests/test_data_converters.py +++ b/tests/test_data_converters.py @@ -323,6 +323,14 @@ def test_multi_human_data_preprocess(): assert os.path.exists('/tmp/preprocessed_npzs/' + 'crowdpose_test.npz') assert os.path.exists('/tmp/preprocessed_npzs/' + 'crowdpose_trainval.npz') + CLIFF_ROOT = os.path.join(root_path, 'eft') + cfg = dict(type='CliffConverter', modes=['coco', 'mpii']) + data_converter = build_data_converter(cfg) + data_converter.convert( + CLIFF_ROOT, output_path, enable_multi_human_data=True) + assert os.path.exists('/tmp/preprocessed_npzs/' + 'cliff_coco_train.npz') + assert os.path.exists('/tmp/preprocessed_npzs/' + 'cliff_mpii_train.npz') + def test_preprocessed_npz(): npz_folder = '/tmp/preprocessed_npzs' diff --git a/tests/test_datasets/test_pipelines.py b/tests/test_datasets/test_pipelines.py index 4e0daf61..f14a003a 100644 --- a/tests/test_datasets/test_pipelines.py +++ b/tests/test_datasets/test_pipelines.py @@ -2,6 +2,7 @@ import pytest from mmhuman3d.data.datasets.pipelines import ( + GetBboxInfo, LoadImageFromFile, SyntheticOcclusion, ) @@ -57,3 +58,17 @@ def test_synthetic_occlusion(): results = pipeline(results) assert results['img'].shape == (224, 224, 3) + + +def test_get_bbox_inf(): + pipeline = GetBboxInfo() + results = { + 'img': np.ones((224, 224, 3)), + 'center': np.array([100, 100]), + 'scale': np.array([10, 10]) + } + pipeline(results=results) + assert 'img_h' in results + assert 'img_w' in results + assert 'focal_length' in results + assert 'bbox_info' in results diff --git a/tests/test_models/test_architectures/test_cliff_mesh_estimator.py b/tests/test_models/test_architectures/test_cliff_mesh_estimator.py new file mode 100644 index 00000000..2896d787 --- /dev/null +++ b/tests/test_models/test_architectures/test_cliff_mesh_estimator.py @@ -0,0 +1,417 @@ +import torch + +from mmhuman3d.core.cameras import build_cameras +from mmhuman3d.models.architectures.cliff_mesh_estimator import \ + CliffImageBodyModelEstimator # noqa: E501 +from mmhuman3d.models.body_models.builder import build_body_model +from mmhuman3d.utils.geometry import project_points + + +def test_cliff_image_body_mesh_estimator(): 
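+ # Constructor smoke test: build a full CLIFF image-based estimator from
+ # config dicts and check that every sub-module is attached. The checkpoint
+ # and body-model files referenced below are assumed to be downloaded and
+ # arranged as described in configs/cliff/README.md.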
+ backbone = dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')) + head = dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz') + body_model_train = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy') + body_model_test = dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy') + convention = 'smpl_54' + loss_keypoints3d = dict(type='SmoothL1Loss', loss_weight=100) + loss_keypoints2d = dict(type='SmoothL1Loss', loss_weight=10) + loss_vertex = dict(type='L1Loss', loss_weight=2) + loss_smpl_pose = dict(type='MSELoss', loss_weight=3) + loss_smpl_betas = dict(type='MSELoss', loss_weight=0.02) + loss_adv = dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1) + model = CliffImageBodyModelEstimator( + backbone=backbone, + head=head, + body_model_train=body_model_train, + body_model_test=body_model_test, + convention=convention, + loss_keypoints3d=loss_keypoints3d, + loss_keypoints2d=loss_keypoints2d, + loss_vertex=loss_vertex, + loss_smpl_pose=loss_smpl_pose, + loss_smpl_betas=loss_smpl_betas, + loss_adv=loss_adv) + assert model.backbone is not None + assert model.head is not None + assert model.body_model_train is not None + assert model.body_model_test is not None + assert model.convention == 'smpl_54' + assert model.loss_keypoints3d is not None + assert model.loss_keypoints2d is not None + assert model.loss_vertex is not None + assert model.loss_smpl_pose is not None + assert model.loss_smpl_betas is not None + assert model.loss_adv is not None + + +def test_compute_keypoints3d_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints3d = torch.zeros((32, 54, 4)) + loss_empty = model.compute_keypoints3d_loss(pred_keypoints3d, + gt_keypoints3d) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints3d = torch.randn((32, 54, 4)) + gt_keypoints3d[:, :, 3] = torch.sigmoid(gt_keypoints3d[:, :, 3]) + loss = model.compute_keypoints3d_loss(pred_keypoints3d, gt_keypoints3d) + assert loss > 0 + + has_keypoints3d = torch.ones(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss > 0 + has_keypoints3d = torch.zeros(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss == 0 + + +def test_compute_keypoints2d_loss_cliff(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints2d = torch.zeros((32, 54, 3)) + pred_cam = torch.randn((32, 3)) + camera_center = torch.randn((32, 2)) + trans = torch.randn((32, 2, 3)) + focal_length = 5000 + loss_empty = model.compute_keypoints2d_loss_cliff(pred_keypoints3d, + pred_cam, gt_keypoints2d, + camera_center, + focal_length, trans) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints2d = 
torch.randn((32, 54, 3)) + gt_keypoints2d[:, :, 2] = torch.sigmoid(gt_keypoints2d[:, :, 2]) + pred_cam = torch.randn((32, 3)) + loss = model.compute_keypoints2d_loss_cliff(pred_keypoints3d, pred_cam, + gt_keypoints2d, camera_center, + focal_length, trans) + assert loss > 0 + + has_keypoints2d = torch.ones((32)) + loss = model.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.zeros((32)) + loss = model.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + assert loss == 0 + + +def test_compute_keypoints2d_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints2d = torch.zeros((32, 54, 3)) + pred_cam = torch.randn((32, 3)) + loss_empty = model.compute_keypoints2d_loss(pred_keypoints3d, pred_cam, + gt_keypoints2d) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints2d = torch.randn((32, 54, 3)) + gt_keypoints2d[:, :, 2] = torch.sigmoid(gt_keypoints2d[:, :, 2]) + pred_cam = torch.randn((32, 3)) + loss = model.compute_keypoints2d_loss(pred_keypoints3d, pred_cam, + gt_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.ones((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.zeros((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss == 0 + + +def test_compute_vertex_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', loss_vertex=dict(type='L1Loss', loss_weight=2)) + + pred_vertices = torch.randn((32, 4096, 3)) + gt_vertices = torch.randn((32, 4096, 3)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_vertex_loss(pred_vertices, gt_vertices, + has_smpl) + assert loss_empty == 0 + + pred_vertices = torch.randn((32, 4096, 3)) + gt_vertices = torch.randn((32, 4096, 3)) + has_smpl = torch.ones((32)) + loss = model.compute_vertex_loss(pred_vertices, gt_vertices, has_smpl) + assert loss > 0 + + +def test_compute_smpl_pose_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_smpl_pose=dict(type='MSELoss', loss_weight=3)) + + pred_rotmat = torch.randn((32, 24, 3, 3)) + gt_pose = torch.randn((32, 24, 3)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_smpl_pose_loss(pred_rotmat, gt_pose, has_smpl) + assert loss_empty == 0 + + pred_rotmat = torch.randn((32, 24, 3, 3)) + gt_pose = torch.randn((32, 24, 3)) + has_smpl = torch.ones((32)) + loss = model.compute_smpl_pose_loss(pred_rotmat, gt_pose, has_smpl) + assert loss > 0 + + +def test_compute_part_segm_loss(): + N = 1 + random_body_pose = torch.rand((N, 69)) + body_model_train = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + model_path='data/body_models/smpl', + extra_joints_regressor='data/body_models/J_regressor_extra.npy') + body_model = build_body_model(body_model_train) + + body_model_output = body_model(body_pose=random_body_pose, ) + gt_model_joins = body_model_output['joints'].detach() + cam = torch.ones(N, 3) + gt_keypoints2d = project_points( + gt_model_joins, cam, focal_length=5000, img_res=224) + 
loss_segm_mask = dict(type='CrossEntropyLoss', loss_weight=60) + + gt_keypoints2d = torch.cat([gt_keypoints2d, torch.ones(N, 49, 1)], dim=-1) + model = CliffImageBodyModelEstimator( + body_model_train=body_model_train, + loss_segm_mask=loss_segm_mask, + ) + gt_vertices = torch.randn(N, 6890, 3) + pred_heatmap = torch.zeros(N, 25, 224, 224) + pred_heatmap[:, 0, :, :] = 1 + has_smpl = torch.ones((N)) + + loss = model.compute_part_segmentation_loss( + pred_heatmap, + gt_vertices, + has_smpl=has_smpl, + gt_keypoints2d=gt_keypoints2d, + gt_model_joints=gt_model_joins) + assert loss > 0 + + +def test_compute_smpl_betas_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02)) + + pred_betas = torch.randn((32, 10)) + gt_betas = torch.randn((32, 10)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_smpl_betas_loss(pred_betas, gt_betas, has_smpl) + assert loss_empty == 0 + + pred_betas = torch.randn((32, 10)) + gt_betas = torch.randn((32, 10)) + has_smpl = torch.ones((32)) + loss = model.compute_smpl_betas_loss(pred_betas, gt_betas, has_smpl) + assert loss > 0 + + +def test_compute_camera_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_camera=dict(type='CameraPriorLoss', loss_weight=60), + ) + + pred_cameras = torch.randn((32, 3)) + loss = model.compute_camera_loss(pred_cameras) + assert loss > 0 + + +def test_compute_losses(): + N = 32 + predictions = {} + predictions['pred_shape'] = torch.randn(N, 10) + predictions['pred_pose'] = torch.randn(N, 24, 3, 3) + predictions['pred_cam'] = torch.randn(N, 3) + + targets = {} + targets['keypoints3d'] = torch.randn(N, 45, 4) + targets['keypoints2d'] = torch.randn(N, 45, 3) + targets['has_smpl'] = torch.ones(N) + targets['smpl_body_pose'] = torch.randn(N, 23, 3) + targets['smpl_global_orient'] = torch.randn(N, 3) + targets['smpl_betas'] = torch.randn(N, 10) + targets['img_h'] = torch.ones(N, 1) * 256 + targets['img_w'] = torch.ones(N, 1) * 192 + targets['center'] = torch.randn(N, 2) + targets['scale'] = torch.randn(N, 1) + targets['focal_length'] = torch.randn(N, 1) + targets['crop_trans'] = torch.randn(N, 2, 3) + + model = CliffImageBodyModelEstimator(convention='smpl_54') + loss = model.compute_losses(predictions, targets) + assert loss == {} + + model = CliffImageBodyModelEstimator( + convention='smpl_45', + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_45', + keypoint_dst='smpl_45', + model_path='data/body_models/smpl'), + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_camera=dict(type='CameraPriorLoss', loss_weight=60)) + + loss = model.compute_losses(predictions, targets) + assert 'keypoints3d_loss' in loss + assert 'keypoints2d_loss' in loss + assert 'vertex_loss' in loss + assert 'smpl_pose_loss' in loss + assert 'smpl_betas_loss' in loss + assert 'camera_loss' in loss + + +def test_run_registration(): + batch_size = 2 + body_model = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + keypoint_approximate=True, + model_path='data/body_models/smpl', + extra_joints_regressor='data/body_models/J_regressor_extra.npy', + batch_size=batch_size) + + camera = build_cameras( + dict( + type='PerspectiveCameras', + convention='opencv', + in_ndc=False, + focal_length=5000, + 
image_size=(224, 224), + principal_point=(112, 112))) + + registrant = dict( + type='SMPLify', + body_model=body_model, + num_epochs=1, + stages=[ + dict( + num_iter=1, + fit_global_orient=True, + fit_transl=True, + fit_body_pose=False, + fit_betas=False) + ], + optimizer=dict(type='Adam', lr=1e-2, betas=(0.9, 0.999)), + keypoints2d_loss=dict( + type='KeypointMSELoss', + loss_weight=1.0, + reduction='sum', + sigma=100), + device=torch.device('cpu'), + camera=camera) + + registration = dict(mode='in_the_loop', registrant=registrant) + + model = CliffImageBodyModelEstimator( + body_model_train=body_model, registration=registration) + assert model.registrant is not None + assert model.fits_dict is not None + + transl = torch.Tensor([0, 0, 1]).view(1, 3).expand(batch_size, -1) + + predictions = dict( + pred_pose=torch.zeros((batch_size, 24, 3, 3)), + pred_shape=torch.zeros((batch_size, 10)), + pred_cam=transl, + ) + + # generate 2D keypoints + smpl = build_body_model(body_model) + keypoints3d = smpl(transl=transl)['joints'].detach() + keypoints2d_xyd = camera.transform_points_screen(keypoints3d) + keypoints2d = keypoints2d_xyd[..., :2] + keypoints2d_conf = torch.ones(*keypoints2d.shape[:2], 1) + keypoints2d = torch.cat([keypoints2d, keypoints2d_conf], dim=-1) + + targets = dict( + img_metas=[dict(dataset_name='coco'), + dict(dataset_name='h36m')], + sample_idx=torch.zeros((batch_size, 1), dtype=torch.int), + is_flipped=torch.tensor([0, 1], dtype=torch.int), + rotation=torch.tensor([0.0, 0.1]), + smpl_betas=torch.zeros((batch_size, 10)), + smpl_global_orient=torch.zeros((batch_size, 3)), + smpl_body_pose=torch.zeros((batch_size, 69)), + keypoints2d=keypoints2d, + has_smpl=torch.tensor([0, 1], dtype=torch.int)) + + model.run_registration(predictions=predictions, targets=targets) + assert 'valid_fit' in targets + assert 'opt_vertices' in targets + assert 'opt_cam_t' in targets + assert 'opt_joints' in targets + assert 'opt_pose' in targets + assert 'opt_betas' in targets diff --git a/tests/test_models/test_heads/test_cliff_head.py b/tests/test_models/test_heads/test_cliff_head.py new file mode 100644 index 00000000..691add98 --- /dev/null +++ b/tests/test_models/test_heads/test_cliff_head.py @@ -0,0 +1,59 @@ +import numpy as np +import pytest +import torch + +from mmhuman3d.models.heads.builder import CliffHead + + +def test_cliff_head(): + # initialize models + model = CliffHead( + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz') + + # image feature from backbone + batch_size = 32 + bbox_info = [-0.5, 0.2, 1.5] + bbox_info = torch.FloatTensor([bbox_info] * batch_size) + x0_shape = (batch_size, 2048, 7, 7) + x0 = _demo_head_inputs(x0_shape) + x0 = torch.tensor(x0).float() + y0 = model(x0, bbox_info) + assert y0['pred_pose'].shape == (batch_size, 24, 3, 3) + assert y0['pred_shape'].shape == (batch_size, 10) + assert y0['pred_cam'].shape == (batch_size, 3) + + # image feature from multi-layer backbone + x1_1_shape = (batch_size, 1024, 14, 14) + x1_2_shape = (batch_size, 2048, 7, 7) + x1 = [_demo_head_inputs(x1_1_shape), _demo_head_inputs(x1_2_shape)] + y1 = model(x1, bbox_info) + assert y1['pred_pose'].shape == (batch_size, 24, 3, 3) + assert y1['pred_shape'].shape == (batch_size, 10) + assert y1['pred_cam'].shape == (batch_size, 3) + + # test temporal feature + T = 16 + x_temp_shape = (batch_size, T, 1024) + x_temp = _demo_head_inputs(x_temp_shape) + with pytest.raises(NotImplementedError): + model(x_temp, bbox_info) + + # test other cases + model_wo_smpl_mean_params = 
CliffHead(feat_dim=2048) + assert model_wo_smpl_mean_params.init_pose.shape == (1, 144) + assert model_wo_smpl_mean_params.init_shape.shape == (1, 10) + assert model_wo_smpl_mean_params.init_cam.shape == (1, 3) + + +def _demo_head_inputs(input_shape=(1, 3, 64, 64)): + """Create a superset of inputs needed to run models. + + Args: + input_shape (tuple): input batch dimensions. + Default: (1, 3, 64, 64). + """ + features = np.random.random(input_shape) + features = torch.FloatTensor(features) + + return features diff --git a/tools/convert_datasets.py b/tools/convert_datasets.py index 83d50ffd..bd06fbcd 100644 --- a/tools/convert_datasets.py +++ b/tools/convert_datasets.py @@ -57,7 +57,7 @@ gta_human=dict(type='GTAHumanConverter', prefix='gta_human'), humman=dict( type='HuMManConverter', modes=['train', 'test'], prefix='humman'), -) + cliff=dict(type='CliffConverter', modes=['coco', 'mpii'])) def parse_args():