From 7996ee5b90735a6e620a741f3b2f8f2722bce7a6 Mon Sep 17 00:00:00 2001 From: oneScotch <71915686+oneScotch@users.noreply.github.com> Date: Wed, 5 Apr 2023 06:26:19 +0200 Subject: [PATCH] [Add] CLIFF (#302) * add cliff head * add function to convert from crop to full camera * add cliff annotation datasets converter * add tramsforms to get bbox information * store crop trans * cliff mesh estimator * modification to take in different resolutions * add configs * add missing comma * format correction * isort formating * correct error in cliff_head * revert unnecessary changes in cliff_head * add configs(single dataset) and small modification * configs format modification * add test for cliff head * format correction * update test file * format correction * update test file * format correction * update test file * format correction * docformatter correction * update test file * format * add README * add README * add test for cliff data converter * add test for cliff mesh estimator * update tests * merge cliff mesh estimator to mesh estimator * revert unnecessary tests * format * Revert to CliffMeshEstimator * Fix wrong class name in test * Fix linter * Fix bugs for test architecture * Fix test_data_converters.py * Update download links * Update pytorch3d install in workflow * Format * Add additional tests * Update to ubuntu-20.04 * Update to ubuntu-20.04 * Fix pickle * Fix setup.cfg * Fix setup.cfg * Change pickle5 to pickle * Fix pandas version --------- Co-authored-by: caizhongang Co-authored-by: caizhongang --- .github/workflows/build.yml | 4 +- .github/workflows/lint.yml | 2 +- configs/cliff/README.md | 81 ++ configs/cliff/coco.py | 189 ++++ configs/cliff/resnet50_pw3d_cache.py | 225 +++++ configs/cliff/resume.py | 228 +++++ mmhuman3d/data/data_converters/__init__.py | 3 +- mmhuman3d/data/data_converters/cliff.py | 121 +++ mmhuman3d/data/datasets/pipelines/__init__.py | 35 +- .../data/datasets/pipelines/transforms.py | 40 +- mmhuman3d/models/architectures/builder.py | 3 + .../architectures/cliff_mesh_estimator.py | 881 ++++++++++++++++++ mmhuman3d/models/heads/builder.py | 2 + mmhuman3d/models/heads/cliff_head.py | 98 ++ mmhuman3d/utils/geometry.py | 21 + requirements/runtime.txt | 2 +- setup.cfg | 2 +- tests/test_data_converters.py | 8 + tests/test_datasets/test_pipelines.py | 15 + .../test_cliff_mesh_estimator.py | 417 +++++++++ .../test_models/test_heads/test_cliff_head.py | 59 ++ tools/convert_datasets.py | 2 +- 22 files changed, 2402 insertions(+), 36 deletions(-) create mode 100644 configs/cliff/README.md create mode 100644 configs/cliff/coco.py create mode 100644 configs/cliff/resnet50_pw3d_cache.py create mode 100644 configs/cliff/resume.py create mode 100644 mmhuman3d/data/data_converters/cliff.py create mode 100644 mmhuman3d/models/architectures/cliff_mesh_estimator.py create mode 100644 mmhuman3d/models/heads/cliff_head.py create mode 100644 tests/test_models/test_architectures/test_cliff_mesh_estimator.py create mode 100644 tests/test_models/test_heads/test_cliff_head.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7fd2b25f..76a50c2c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,7 +26,7 @@ concurrency: jobs: build_cuda101: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 strategy: matrix: python-version: [3.8] @@ -69,7 +69,7 @@ jobs: - name: Install pytorch3d run: | conda install -c fvcore -c iopath -c conda-forge fvcore iopath -y - conda install pytorch3d -c pytorch3d + pip install 
"git+https://github.com/facebookresearch/pytorch3d.git" - name: Install MMCV run: | pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/cpu/torch${{matrix.torch}}/index.html diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 822bdcd9..2208335d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,7 +8,7 @@ concurrency: jobs: lint: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 diff --git a/configs/cliff/README.md b/configs/cliff/README.md new file mode 100644 index 00000000..8f536fa6 --- /dev/null +++ b/configs/cliff/README.md @@ -0,0 +1,81 @@ +# CLIFF + +## Introduction + +We provide the config files for CLIFF: [CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation](https://arxiv.org/pdf/2208.00571.pdf). + +```BibTeX + +@Inproceedings{li2022cliff, + author = {Li, Zhihao and + Liu, Jianzhuang and + Zhang, Zhensong and + Xu, Songcen and + Yan, Youliang}, + title = {CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation}, + booktitle = {ECCV}, + year = {2022} +} + +``` + +## Notes + +- [SMPL](https://smpl.is.tue.mpg.de/) v1.0 is used in our experiments. +- [J_regressor_extra.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_extra.npy?versionId=CAEQHhiBgIDD6c3V6xciIGIwZDEzYWI5NTBlOTRkODU4OTE1M2Y4YTI0NTVlZGM1) +- [J_regressor_h36m.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/J_regressor_h36m.npy?versionId=CAEQHhiBgIDE6c3V6xciIDdjYzE3MzQ4MmU4MzQyNmRiZDA5YTg2YTI5YWFkNjRi) +- [pascal_occluders.npy](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/pare/pascal_occluders.npy?versionId=CAEQOhiBgMCH2fqigxgiIDY0YzRiNThkMjU1MzRjZTliMTBhZmFmYWY0MTViMTIx) +- [resnet50_a1h2_176-001a1197.pth](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1h2_176-001a1197.pth) +- [resnet50_a1h2_176-001a1197.pth(alternative download link)](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_a1h2_176-001a1197.pth) + +Download the above resources and arrange them in the following file structure: + +```text +mmhuman3d +├── mmhuman3d +├── docs +├── tests +├── tools +├── configs +└── data + ├── checkpoints + │ ├── resnet50_a1h2_176-001a1197.pth + ├── body_models + │ ├── J_regressor_extra.npy + │ ├── J_regressor_h36m.npy + │ ├── smpl_mean_params.npz + │ └── smpl + │ ├── SMPL_FEMALE.pkl + │ ├── SMPL_MALE.pkl + │ └── SMPL_NEUTRAL.pkl + ├── preprocessed_datasets + │ ├── cliff_coco_train.npz + │ ├── cliff_mpii_train.npz + │ ├── h36m_mosh_train.npz + │ ├── muco3dhp_train.npz + │ ├── mpi_inf_3dhp_train.npz + │ └── pw3d_test.npz + ├── occluders + │ ├── pascal_occluders.npy + └── datasets + ├── coco + ├── h36m + ├── muco + ├── mpi_inf_3dhp + ├── mpii + └── pw3d +``` + +## Training +Stage 1: First use [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) to train. + +Stage 2: After around 150 epoches, switch to [resume.py](resume.py) by using "--resume-from" optional argument. + +## Results and Models + +We evaluate HMR on 3DPW. Values are MPJPE/PA-MPJPE. 
+ +| Config | 3DPW | Download | +|:---------------------------------------------------------:|:-------------:|:------:| +| Stage 1: [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) | 48.65 / 76.49 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_cliff-8328e2e2_20230327.pth) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/20220909_142945.log) +| Stage 2: [resnet50_pw3d_cache.py](resnet50_pw3d_cache.py) | 47.38 / 75.08 | [model](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/resnet50_cliff_new-1e639f1d_20230327.pth) | [log](https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/cliff/20230222_092227.log) diff --git a/configs/cliff/coco.py b/configs/cliff/coco.py new file mode 100644 index 00000000..6651909e --- /dev/null +++ b/configs/cliff/coco.py @@ -0,0 +1,189 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=1e-4), + head=dict(type='Adam', lr=1e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='Fixed', by_epoch=False) +runner = dict(type='EpochBasedRunner', max_epochs=800) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', 
img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + ], + partition=[1.0], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/cliff/resnet50_pw3d_cache.py b/configs/cliff/resnet50_pw3d_cache.py new file mode 100644 index 00000000..d0d8becd --- /dev/null +++ b/configs/cliff/resnet50_pw3d_cache.py @@ -0,0 +1,225 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=3e-4), + head=dict(type='Adam', lr=3e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='step', gamma=0.1, step=[100]) +runner = dict(type='EpochBasedRunner', max_epochs=250) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + 
checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'h36m': 'data/cache/h36m_mosh_train_smpl_54.npz', + 'mpi_inf_3dhp': 'data/cache/mpi_inf_3dhp_train_smpl_54.npz', + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', + 'cliff_mpii': 'data/cache/cliff_mpii_train_smpl_54.npz', + 'pw3d': 'data/cache/pw3d_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + 
pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['h36m'], + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='mpi_inf_3dhp', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['mpi_inf_3dhp'], + ann_file='mpi_inf_3dhp_train.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_mpii'], + ann_file='cliff_mpii_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + dict( + type=dataset_type, + dataset_name='pw3d', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['pw3d'], + ann_file='pw3d_train.npz'), + ], + partition=[0.4, 0.1, 0.1, 0.2, 0.2], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/configs/cliff/resume.py b/configs/cliff/resume.py new file mode 100644 index 00000000..652de149 --- /dev/null +++ b/configs/cliff/resume.py @@ -0,0 +1,228 @@ +_base_ = ['../_base_/default_runtime.py'] +use_adversarial_train = True + +# evaluate +evaluation = dict(metric=['pa-mpjpe', 'mpjpe']) +# optimizer +optimizer = dict( + backbone=dict(type='Adam', lr=3e-4), + head=dict(type='Adam', lr=3e-4), + # disc=dict(type='Adam', lr=1e-4) +) +optimizer_config = dict(grad_clip=2.0) +# learning policy +lr_config = dict(policy='step', gamma=0.1, step=[100]) +runner = dict(type='EpochBasedRunner', max_epochs=160) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +img_resolution = (192, 256) + +# model settings +model = dict( + type='CliffImageBodyModelEstimator', + backbone=dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict( + type='Pretrained', + checkpoint='data/checkpoints/resnet50_a1h2_176-001a1197.pth')), + head=dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz'), + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy'), + body_model_test=dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + 
loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_adv=dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1), + # disc=dict(type='SMPLDiscriminator') +) +# dataset settings +dataset_type = 'HumanImageDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_keys = [ + 'has_smpl', + 'smpl_body_pose', + 'smpl_global_orient', + 'smpl_betas', + 'smpl_transl', + 'keypoints2d', + 'keypoints3d', + 'sample_idx', + 'img_h', # extras for cliff + 'img_w', + 'focal_length', + 'center', + 'scale', + 'bbox_info', + 'crop_trans', + 'inv_trans' +] +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomChannelNoise', noise_factor=0.4), + dict( + type='SyntheticOcclusion', + occluders_file='data/occluders/pascal_occluders.npy'), + dict(type='RandomHorizontalFlip', flip_prob=0.5, convention='smpl_54'), + dict(type='GetRandomScaleRotation', rot_factor=30, scale_factor=0.25), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] +adv_data_keys = [ + 'smpl_body_pose', 'smpl_global_orient', 'smpl_betas', 'smpl_transl' +] +train_adv_pipeline = [dict(type='Collect', keys=adv_data_keys, meta_keys=[])] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='GetRandomScaleRotation', rot_factor=0, scale_factor=0), + dict(type='GetBboxInfo'), + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=data_keys), + dict( + type='Collect', + keys=['img', *data_keys], + meta_keys=[ + 'image_path', 'center', 'scale', 'rotation', 'img_h', 'img_w', + 'bbox_info' + ]) +] + +inference_pipeline = [ + dict(type='MeshAffine', img_res=img_resolution), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img', 'sample_idx'], + meta_keys=['image_path', 'center', 'scale', 'rotation']) +] + +cache_files = { + 'h36m': 'data/cache/h36m_mosh_train_smpl_54.npz', + 'muco': 'data/cache/muco3dhp_train.npz', + 'cliff_coco': 'data/cache/cliff_coco_train_smpl_54.npz', + 'cliff_mpii': 'data/cache/cliff_mpii_train_smpl_54.npz', + 'pw3d': 'data/cache/pw3d_train_smpl_54.npz', +} +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='AdversarialDataset', + train_dataset=dict( + type='MixedDataset', + configs=[ + dict( + type=dataset_type, + dataset_name='h36m', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['h36m'], + ann_file='h36m_mosh_train.npz'), + dict( + type=dataset_type, + dataset_name='muco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['muco'], + ann_file='muco3dhp_train.npz'), + dict( + type=dataset_type, + dataset_name='mpii', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['cliff_mpii'], + ann_file='cliff_mpii_train.npz'), + dict( + type=dataset_type, + dataset_name='coco', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + 
cache_data_path=cache_files['cliff_coco'], + ann_file='cliff_coco_train.npz'), + dict( + type=dataset_type, + dataset_name='pw3d', + data_prefix='data', + pipeline=train_pipeline, + convention='smpl_54', + cache_data_path=cache_files['pw3d'], + ann_file='pw3d_train.npz'), + ], + partition=[0.4, 0.1, 0.1, 0.2, 0.2], + ), + adv_dataset=dict( + type='MeshDataset', + dataset_name='cmu_mosh', + data_prefix='data', + pipeline=train_adv_pipeline, + ann_file='cmu_mosh.npz')), + val=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), + test=dict( + type=dataset_type, + body_model=dict( + type='GenderedSMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy'), + dataset_name='pw3d', + data_prefix='data', + pipeline=test_pipeline, + ann_file='pw3d_test.npz'), +) diff --git a/mmhuman3d/data/data_converters/__init__.py b/mmhuman3d/data/data_converters/__init__.py index c056f926..7471bfa4 100644 --- a/mmhuman3d/data/data_converters/__init__.py +++ b/mmhuman3d/data/data_converters/__init__.py @@ -1,6 +1,7 @@ from .agora import AgoraConverter from .amass import AmassConverter from .builder import build_data_converter +from .cliff import CliffConverter from .coco import CocoConverter from .coco_hybrik import CocoHybrIKConverter from .coco_wholebody import CocoWholebodyConverter @@ -43,5 +44,5 @@ 'SurrealConverter', 'InstaVibeConverter', 'SpinConverter', 'VibeConverter', 'HuMManConverter', 'FFHQFlameConverter', 'ExposeCuratedFitsConverter', 'ExposeSPINSMPLXConverter', 'FreihandConverter', 'StirlingConverter', - 'EHFConverter' + 'EHFConverter', 'CliffConverter' ] diff --git a/mmhuman3d/data/data_converters/cliff.py b/mmhuman3d/data/data_converters/cliff.py new file mode 100644 index 00000000..e34e897f --- /dev/null +++ b/mmhuman3d/data/data_converters/cliff.py @@ -0,0 +1,121 @@ +import os +from typing import List + +import numpy as np + +from mmhuman3d.core.conventions.keypoints_mapping import convert_kps +from mmhuman3d.data.data_structures.human_data import HumanData +from mmhuman3d.data.data_structures.multi_human_data import MultiHumanData +from .base_converter import BaseModeConverter +from .builder import DATA_CONVERTERS + + +@DATA_CONVERTERS.register_module() +class CliffConverter(BaseModeConverter): + """CLIFF datasets converter `Carrying Location Information in Full Frames + into Human Pose and Shape Estimation' More details can be found in the + `paper. + + `__. + Args: + modes (list): 'coco', 'mpii' + for accepted modes + """ + + ACCEPTED_MODES = ['coco', 'mpii'] + + def __init__(self, modes: List = []) -> None: + super(CliffConverter, self).__init__(modes) + + # def __init__(self) -> None: + self.mapping_dict = { + 'coco': 'coco2014part_cliffGT.npz', + 'mpii': 'mpii_cliffGT.npz', + } + + def convert_by_mode(self, + dataset_path: str, + out_path: str, + mode: str, + enable_multi_human_data: bool = False) -> dict: + """ + Args: + dataset_path (str): Path to directory where spin preprocessed + npz files are stored + out_path (str): Path to directory to save preprocessed npz file + mode (str): Mode in accepted modes + enable_multi_human_data (bool): + Whether to generate a multi-human data. If set to True, + stored in MultiHumanData() format. 
+ Default: False, stored in HumanData() format. + + Returns: + dict: + A dict containing keys image_path, bbox_xywh, keypoints2d, + keypoints2d_mask,stored in HumanData() format. keypoints3d, + keypoints3d_mask, smpl are added if available. + + """ + if enable_multi_human_data: + # use MultiHumanData to store all data + human_data = MultiHumanData() + else: + # use HumanData to store all data + human_data = HumanData() + + image_path_, keypoints2d_, bbox_xywh_ = [], [], [] + + if mode in self.mapping_dict.keys(): + seq_file = self.mapping_dict[mode] + seq_path = os.path.join(dataset_path, seq_file) + + data = np.load(seq_path) + + keypoints2d_ = data['part'] + image_path_ = data['imgname'] + + # center scale to bbox + w = h = data['scale'] * 200 + x = data['center'][:, 0] - w / 2 + y = data['center'][:, 1] - h / 2 + + bbox_xywh_ = np.column_stack((x, y, w, h)) + + # convert keypoints + bbox_xywh_ = np.array(bbox_xywh_).reshape((-1, 4)) + bbox_xywh_ = np.hstack([bbox_xywh_, np.ones([bbox_xywh_.shape[0], 1])]) + keypoints2d_ = np.array(keypoints2d_).reshape((-1, 24, 3)) + keypoints2d_, keypoints2d_mask = convert_kps(keypoints2d_, 'smpl_24', + 'human_data') + + if 'S' in data: + keypoints3d_ = data['S'] + keypoints3d_ = np.array(keypoints3d_).reshape((-1, 24, 4)) + keypoints3d_, keypoints3d_mask = convert_kps( + keypoints3d_, 'smpl_24', 'human_data') + human_data['keypoints3d_mask'] = keypoints3d_mask + human_data['keypoints3d'] = keypoints3d_ + + if 'has_smpl' in data: + has_smpl = data['has_smpl'] + smpl = {} + smpl['body_pose'] = np.array(data['pose'][:, 3:]).reshape( + (-1, 23, 3)) + smpl['global_orient'] = np.array(data['pose'][:, :3]).reshape( + (-1, 3)) + smpl['betas'] = np.array(data['shape']).reshape((-1, 10)) + human_data['smpl'] = smpl + human_data['has_smpl'] = has_smpl + + human_data['image_path'] = image_path_.tolist() + human_data['bbox_xywh'] = bbox_xywh_ + human_data['keypoints2d_mask'] = keypoints2d_mask + human_data['keypoints2d'] = keypoints2d_ + human_data['config'] = mode + human_data.compress_keypoints_by_mask() + + # store the data struct + if not os.path.isdir(out_path): + os.makedirs(out_path) + out_file = os.path.join(out_path, f'cliff_{mode}_train.npz') + human_data.dump(out_file) diff --git a/mmhuman3d/data/datasets/pipelines/__init__.py b/mmhuman3d/data/datasets/pipelines/__init__.py index ae6d9dbe..6551ed63 100644 --- a/mmhuman3d/data/datasets/pipelines/__init__.py +++ b/mmhuman3d/data/datasets/pipelines/__init__.py @@ -22,6 +22,7 @@ BBoxCenterJitter, CenterCrop, ColorJitter, + GetBboxInfo, GetRandomScaleRotation, Lighting, MeshAffine, @@ -33,31 +34,11 @@ ) __all__ = [ - 'Compose', - 'to_tensor', - 'ToTensor', - 'ImageToTensor', - 'ToPIL', - 'ToNumpy', - 'Transpose', - 'Collect', - 'LoadImageFromFile', - 'CenterCrop', - 'RandomHorizontalFlip', - 'ColorJitter', - 'Lighting', - 'RandomChannelNoise', - 'GetRandomScaleRotation', - 'MeshAffine', - 'HybrIKRandomFlip', - 'HybrIKAffine', - 'GenerateHybrIKTarget', - 'RandomDPG', - 'RandomOcclusion', - 'Rotation', - 'NewKeypointsSelection', - 'Normalize', - 'SyntheticOcclusion', - 'BBoxCenterJitter', - 'SimulateLowRes', + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToPIL', 'ToNumpy', + 'Transpose', 'Collect', 'LoadImageFromFile', 'CenterCrop', + 'RandomHorizontalFlip', 'ColorJitter', 'Lighting', 'RandomChannelNoise', + 'GetRandomScaleRotation', 'MeshAffine', 'HybrIKRandomFlip', 'HybrIKAffine', + 'GenerateHybrIKTarget', 'RandomDPG', 'RandomOcclusion', 'Rotation', + 'NewKeypointsSelection', 'Normalize', 
'SyntheticOcclusion', + 'BBoxCenterJitter', 'SimulateLowRes', 'GetBboxInfo' ] diff --git a/mmhuman3d/data/datasets/pipelines/transforms.py b/mmhuman3d/data/datasets/pipelines/transforms.py index 078bcd38..da2f42f9 100644 --- a/mmhuman3d/data/datasets/pipelines/transforms.py +++ b/mmhuman3d/data/datasets/pipelines/transforms.py @@ -746,14 +746,18 @@ class MeshAffine: """ def __init__(self, img_res): - self.img_res = img_res - self.image_size = np.array([img_res, img_res]) + if isinstance(img_res, tuple): + self.image_size = img_res + else: + self.image_size = np.array([img_res, img_res]) def __call__(self, results): c = results['center'] s = results['scale'] r = results['rotation'] trans = get_affine_transform(c, s, r, self.image_size) + inv_trans = get_affine_transform(c, s, 0., self.image_size, inv=True) + crop_trans = get_affine_transform(c, s, 0., self.image_size) if 'img' in results: img = results['img'] @@ -797,6 +801,8 @@ def __call__(self, results): global_orient = _rotate_smpl_pose(global_orient, r) results['smplx_global_orient'] = global_orient + results['crop_trans'] = crop_trans + results['inv_trans'] = inv_trans return results @@ -951,3 +957,33 @@ def __call__(self, results): results['img'] = img return results + + +@PIPELINES.register_module() +class GetBboxInfo: + """Get bbox for cliff.""" + + def estimate_focal_length(self, img_h, img_w): + return (img_w * img_w + img_h * img_h)**0.5 # fov: 55 degree + + def __call__(self, results): + """(1) Get focal length from original image (2) get bbox_info from c + and s.""" + img = results['img'] + img_h, img_w = img.shape[:2] + focal_length = self.estimate_focal_length(img_h, img_w) + + results['img_h'] = img_h + results['img_w'] = img_w + results['focal_length'] = focal_length + cx, cy = results['center'] + s = results['scale'][0] + + bbox_info = np.stack([cx - img_w / 2., cy - img_h / 2., s]) + bbox_info[:2] = bbox_info[:2] / focal_length * 2.8 # [-1, 1] + bbox_info[2] = (bbox_info[2] - 0.24 * focal_length) / ( + 0.06 * focal_length) # [-1, 1] + + results['bbox_info'] = np.float32(bbox_info) + + return results diff --git a/mmhuman3d/models/architectures/builder.py b/mmhuman3d/models/architectures/builder.py index 4e504d82..15ffbc01 100644 --- a/mmhuman3d/models/architectures/builder.py +++ b/mmhuman3d/models/architectures/builder.py @@ -3,6 +3,7 @@ from mmcv.cnn import MODELS as MMCV_MODELS from mmcv.utils import Registry +from .cliff_mesh_estimator import CliffImageBodyModelEstimator from .expressive_mesh_estimator import SMPLXImageBodyModelEstimator from .hybrik import HybrIK_trainer from .mesh_estimator import ImageBodyModelEstimator, VideoBodyModelEstimator @@ -25,6 +26,8 @@ def build_from_cfg(cfg, registry, default_args=None): name='VideoBodyModelEstimator', module=VideoBodyModelEstimator) ARCHITECTURES.register_module( name='SMPLXImageBodyModelEstimator', module=SMPLXImageBodyModelEstimator) +ARCHITECTURES.register_module( + name='CliffImageBodyModelEstimator', module=CliffImageBodyModelEstimator) ARCHITECTURES.register_module(name='PyMAFX', module=PyMAFX) diff --git a/mmhuman3d/models/architectures/cliff_mesh_estimator.py b/mmhuman3d/models/architectures/cliff_mesh_estimator.py new file mode 100644 index 00000000..cea36317 --- /dev/null +++ b/mmhuman3d/models/architectures/cliff_mesh_estimator.py @@ -0,0 +1,881 @@ +from abc import ABCMeta, abstractmethod +from typing import Optional, Tuple, Union + +import torch +import torch.nn.functional as F + +import mmhuman3d.core.visualization.visualize_smpl as visualize_smpl +from 
mmhuman3d.core.conventions.keypoints_mapping import get_keypoint_idx +from mmhuman3d.models.utils import FitsDict +from mmhuman3d.utils.geometry import ( + batch_rodrigues, + cam_crop2full, + estimate_translation, + perspective_projection, + project_points, + rotation_matrix_to_angle_axis, +) +from ..backbones.builder import build_backbone +from ..body_models.builder import build_body_model +from ..discriminators.builder import build_discriminator +from ..heads.builder import build_head +from ..losses.builder import build_loss +from ..necks.builder import build_neck +from ..registrants.builder import build_registrant +from .base_architecture import BaseArchitecture + + +def set_requires_grad(nets, requires_grad=False): + """Set requies_grad for all the networks. + + Args: + nets (nn.Module | list[nn.Module]): A list of networks or a single + network. + requires_grad (bool): Whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + +class BodyModelEstimator(BaseArchitecture, metaclass=ABCMeta): + """BodyModelEstimator Architecture. + + Args: + backbone (dict | None, optional): Backbone config dict. Default: None. + neck (dict | None, optional): Neck config dict. Default: None + head (dict | None, optional): Regressor config dict. Default: None. + disc (dict | None, optional): Discriminator config dict. + Default: None. + registration (dict | None, optional): Registration config dict. + Default: None. + body_model_train (dict | None, optional): SMPL config dict during + training. Default: None. + body_model_test (dict | None, optional): SMPL config dict during + test. Default: None. + convention (str, optional): Keypoints convention. Default: "human_data" + loss_keypoints2d (dict | None, optional): Losses config dict for + 2D keypoints. Default: None. + loss_keypoints3d (dict | None, optional): Losses config dict for + 3D keypoints. Default: None. + loss_vertex (dict | None, optional): Losses config dict for mesh + vertices. Default: None + loss_smpl_pose (dict | None, optional): Losses config dict for smpl + pose. Default: None + loss_smpl_betas (dict | None, optional): Losses config dict for smpl + betas. Default: None + loss_camera (dict | None, optional): Losses config dict for predicted + camera parameters. Default: None + loss_adv (dict | None, optional): Losses config for adversial + training. Default: None. + loss_segm_mask (dict | None, optional): Losses config for predicted + part segmentation. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + backbone: Optional[Union[dict, None]] = None, + neck: Optional[Union[dict, None]] = None, + head: Optional[Union[dict, None]] = None, + disc: Optional[Union[dict, None]] = None, + registration: Optional[Union[dict, None]] = None, + body_model_train: Optional[Union[dict, None]] = None, + body_model_test: Optional[Union[dict, None]] = None, + convention: Optional[str] = 'human_data', + loss_keypoints2d: Optional[Union[dict, None]] = None, + loss_keypoints3d: Optional[Union[dict, None]] = None, + loss_vertex: Optional[Union[dict, None]] = None, + loss_smpl_pose: Optional[Union[dict, None]] = None, + loss_smpl_betas: Optional[Union[dict, None]] = None, + loss_camera: Optional[Union[dict, None]] = None, + loss_adv: Optional[Union[dict, None]] = None, + loss_segm_mask: Optional[Union[dict, None]] = None, + init_cfg: Optional[Union[list, dict, None]] = None): + super(BodyModelEstimator, self).__init__(init_cfg) + self.backbone = build_backbone(backbone) + self.neck = build_neck(neck) + self.head = build_head(head) + self.disc = build_discriminator(disc) + + self.body_model_train = build_body_model(body_model_train) + self.body_model_test = build_body_model(body_model_test) + self.convention = convention + + # TODO: support HMR+ + + self.registration = registration + if registration is not None: + self.fits_dict = FitsDict(fits='static') + self.registration_mode = self.registration['mode'] + self.registrant = build_registrant(registration['registrant']) + else: + self.registrant = None + + self.loss_keypoints2d = build_loss(loss_keypoints2d) + self.loss_keypoints3d = build_loss(loss_keypoints3d) + + self.loss_vertex = build_loss(loss_vertex) + self.loss_smpl_pose = build_loss(loss_smpl_pose) + self.loss_smpl_betas = build_loss(loss_smpl_betas) + self.loss_adv = build_loss(loss_adv) + self.loss_camera = build_loss(loss_camera) + self.loss_segm_mask = build_loss(loss_segm_mask) + set_requires_grad(self.body_model_train, False) + set_requires_grad(self.body_model_test, False) + + def train_step(self, data_batch, optimizer, **kwargs): + """Train step function. + + In this function, the detector will finish the train step following + the pipeline: + 1. get fake and real SMPL parameters + 2. optimize discriminator (if have) + 3. optimize generator + If `self.train_cfg.disc_step > 1`, the train step will contain multiple + iterations for optimizing discriminator with different input data and + only one iteration for optimizing generator after `disc_step` + iterations for discriminator. + Args: + data_batch (torch.Tensor): Batch of data as input. + optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for + generator and discriminator (if have). + Returns: + outputs (dict): Dict with loss, information for logger, + the number of samples. 
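+
+        Example:
+            A minimal, illustrative sketch of the CLIFF-specific forward in
+            this step (names follow this file; the exact feature shape
+            depends on the backbone and input resolution in the config):
+
+            >>> features = self.backbone(data_batch['img'])
+            >>> predictions = self.head(features, data_batch['bbox_info'])
+            >>> # predictions holds 'pred_pose', 'pred_shape' and 'pred_cam'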
+ """ + if self.backbone is not None: + img = data_batch['img'] + features = self.backbone(img) + else: + features = data_batch['features'] + + if self.neck is not None: + features = self.neck(features) + + # NOTE: features and bbox_info taken as input for Cliff + bbox_info = data_batch['bbox_info'] + predictions = self.head(features, bbox_info) + targets = self.prepare_targets(data_batch) + + # optimize discriminator (if have) + if self.disc is not None: + self.optimize_discrinimator(predictions, data_batch, optimizer) + + if self.registration is not None: + targets = self.run_registration(predictions, targets) + + losses = self.compute_losses(predictions, targets) + # optimizer generator part + if self.disc is not None: + adv_loss = self.optimize_generator(predictions) + losses.update(adv_loss) + + loss, log_vars = self._parse_losses(losses) + for key in optimizer.keys(): + optimizer[key].zero_grad() + loss.backward() + for key in optimizer.keys(): + optimizer[key].step() + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + return outputs + + def run_registration( + self, + predictions: dict, + targets: dict, + threshold: Optional[float] = 10.0, + focal_length: Optional[float] = 5000.0, + img_res: Optional[Union[Tuple[int], int]] = 224) -> dict: + """Run registration on 2D keypoinst in predictions to obtain SMPL + parameters as pseudo ground truth. + + Args: + predictions (dict): predicted SMPL parameters are used for + initialization. + targets (dict): existing ground truths with 2D keypoints + threshold (float, optional): the threshold to update fits + dictionary. Default: 10.0. + focal_length (tuple(int) | int, optional): camera focal_length + img_res (int, optional): image resolution + + Returns: + targets: contains additional SMPL parameters + """ + + img_metas = targets['img_metas'] + dataset_name = [meta['dataset_name'] for meta in img_metas + ] # name of the dataset the image comes from + + indices = targets['sample_idx'].squeeze() + is_flipped = targets['is_flipped'].squeeze().bool( + ) # flag that indicates whether image was flipped + # during data augmentation + rot_angle = targets['rotation'].squeeze( + ) # rotation angle used for data augmentation Q + gt_betas = targets['smpl_betas'].float() + gt_global_orient = targets['smpl_global_orient'].float() + gt_pose = targets['smpl_body_pose'].float().view(-1, 69) + + pred_rotmat = predictions['pred_pose'].detach().clone() + pred_betas = predictions['pred_shape'].detach().clone() + pred_cam = predictions['pred_cam'].detach().clone() + pred_cam_t = torch.stack([ + pred_cam[:, 1], pred_cam[:, 2], 2 * focal_length / + (img_res * pred_cam[:, 0] + 1e-9) + ], + dim=-1) + + gt_keypoints_2d = targets['keypoints2d'].float() + num_keypoints = gt_keypoints_2d.shape[1] + + has_smpl = targets['has_smpl'].view( + -1).bool() # flag that indicates whether SMPL parameters are valid + batch_size = has_smpl.shape[0] + device = has_smpl.device + + # Get GT vertices and model joints + # Note that gt_model_joints is different from gt_joints as + # it comes from SMPL + gt_out = self.body_model_train( + betas=gt_betas, body_pose=gt_pose, global_orient=gt_global_orient) + # TODO: support more convention + assert num_keypoints == 49 + gt_model_joints = gt_out['joints'] + gt_vertices = gt_out['vertices'] + + # Get current best fits from the dictionary + opt_pose, opt_betas = self.fits_dict[(dataset_name, indices.cpu(), + rot_angle.cpu(), + is_flipped.cpu())] + + opt_pose = opt_pose.to(device) + opt_betas 
= opt_betas.to(device) + opt_output = self.body_model_train( + betas=opt_betas, + body_pose=opt_pose[:, 3:], + global_orient=opt_pose[:, :3]) + opt_joints = opt_output['joints'] + opt_vertices = opt_output['vertices'] + + gt_keypoints_2d_orig = gt_keypoints_2d.clone() + # Estimate camera translation given the model joints and 2D keypoints + # by minimizing a weighted least squares loss + gt_cam_t = estimate_translation( + gt_model_joints, + gt_keypoints_2d_orig, + focal_length=focal_length, + img_size=img_res) + + opt_cam_t = estimate_translation( + opt_joints, + gt_keypoints_2d_orig, + focal_length=focal_length, + img_size=img_res) + + with torch.no_grad(): + loss_dict = self.registrant.evaluate( + global_orient=opt_pose[:, :3], + body_pose=opt_pose[:, 3:], + betas=opt_betas, + transl=opt_cam_t, + keypoints2d=gt_keypoints_2d_orig[:, :, :2], + keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2], + reduction_override='none') + opt_joint_loss = loss_dict['keypoint2d_loss'].sum(dim=-1).sum(dim=-1) + + if self.registration_mode == 'in_the_loop': + # Convert predicted rotation matrices to axis-angle + pred_rotmat_hom = torch.cat([ + pred_rotmat.detach().view(-1, 3, 3).detach(), + torch.tensor([0, 0, 1], dtype=torch.float32, + device=device).view(1, 3, 1).expand( + batch_size * 24, -1, -1) + ], + dim=-1) + pred_pose = rotation_matrix_to_angle_axis( + pred_rotmat_hom).contiguous().view(batch_size, -1) + # tgm.rotation_matrix_to_angle_axis returns NaN for 0 rotation, + # so manually hack it + pred_pose[torch.isnan(pred_pose)] = 0.0 + + registrant_output = self.registrant( + keypoints2d=gt_keypoints_2d_orig[:, :, :2], + keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2], + init_global_orient=pred_pose[:, :3], + init_transl=pred_cam_t, + init_body_pose=pred_pose[:, 3:], + init_betas=pred_betas, + return_joints=True, + return_verts=True, + return_losses=True) + new_opt_vertices = registrant_output[ + 'vertices'] - pred_cam_t.unsqueeze(1) + new_opt_joints = registrant_output[ + 'joints'] - pred_cam_t.unsqueeze(1) + + new_opt_global_orient = registrant_output['global_orient'] + new_opt_body_pose = registrant_output['body_pose'] + new_opt_pose = torch.cat( + [new_opt_global_orient, new_opt_body_pose], dim=1) + + new_opt_betas = registrant_output['betas'] + new_opt_cam_t = registrant_output['transl'] + new_opt_joint_loss = registrant_output['keypoint2d_loss'].sum( + dim=-1).sum(dim=-1) + + # Will update the dictionary for the examples where the new loss + # is less than the current one + update = (new_opt_joint_loss < opt_joint_loss) + + opt_joint_loss[update] = new_opt_joint_loss[update] + opt_vertices[update, :] = new_opt_vertices[update, :] + opt_joints[update, :] = new_opt_joints[update, :] + opt_pose[update, :] = new_opt_pose[update, :] + opt_betas[update, :] = new_opt_betas[update, :] + opt_cam_t[update, :] = new_opt_cam_t[update, :] + + self.fits_dict[(dataset_name, indices.cpu(), rot_angle.cpu(), + is_flipped.cpu(), + update.cpu())] = (opt_pose.cpu(), opt_betas.cpu()) + + # Replace extreme betas with zero betas + opt_betas[(opt_betas.abs() > 3).any(dim=-1)] = 0. 
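+        # (Betas that far from the mean almost always come from a failed
+        # fit, so the shape is reset to the mean shape instead.)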
+ + # Replace the optimized parameters with the ground truth parameters, + # if available + opt_vertices[has_smpl, :, :] = gt_vertices[has_smpl, :, :] + opt_cam_t[has_smpl, :] = gt_cam_t[has_smpl, :] + opt_joints[has_smpl, :, :] = gt_model_joints[has_smpl, :, :] + opt_pose[has_smpl, 3:] = gt_pose[has_smpl, :] + opt_pose[has_smpl, :3] = gt_global_orient[has_smpl, :] + opt_betas[has_smpl, :] = gt_betas[has_smpl, :] + + # Assert whether a fit is valid by comparing the joint loss with + # the threshold + valid_fit = (opt_joint_loss < threshold).to(device) + valid_fit = valid_fit | has_smpl + targets['valid_fit'] = valid_fit + + targets['opt_vertices'] = opt_vertices + targets['opt_cam_t'] = opt_cam_t + targets['opt_joints'] = opt_joints + targets['opt_pose'] = opt_pose + targets['opt_betas'] = opt_betas + + return targets + + def optimize_discrinimator(self, predictions: dict, data_batch: dict, + optimizer: dict): + """Optimize discrinimator during adversarial training.""" + set_requires_grad(self.disc, True) + fake_data = self.make_fake_data(predictions, requires_grad=False) + real_data = self.make_real_data(data_batch) + fake_score = self.disc(fake_data) + real_score = self.disc(real_data) + + disc_losses = {} + disc_losses['real_loss'] = self.loss_adv( + real_score, target_is_real=True, is_disc=True) + disc_losses['fake_loss'] = self.loss_adv( + fake_score, target_is_real=False, is_disc=True) + loss_disc, log_vars_d = self._parse_losses(disc_losses) + + optimizer['disc'].zero_grad() + loss_disc.backward() + optimizer['disc'].step() + + def optimize_generator(self, predictions: dict): + """Optimize generator during adversarial training.""" + set_requires_grad(self.disc, False) + fake_data = self.make_fake_data(predictions, requires_grad=True) + pred_score = self.disc(fake_data) + loss_adv = self.loss_adv( + pred_score, target_is_real=True, is_disc=False) + loss = dict(adv_loss=loss_adv) + return loss + + def compute_keypoints3d_loss( + self, + pred_keypoints3d: torch.Tensor, + gt_keypoints3d: torch.Tensor, + has_keypoints3d: Optional[torch.Tensor] = None): + """Compute loss for 3d keypoints.""" + keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1) + keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3) + pred_keypoints3d = pred_keypoints3d.float() + gt_keypoints3d = gt_keypoints3d[:, :, :3].float() + + # currently, only mpi_inf_3dhp and h36m have 3d keypoints + # both datasets have right_hip_extra and left_hip_extra + right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention) + left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention) + gt_pelvis = (gt_keypoints3d[:, right_hip_idx, :] + + gt_keypoints3d[:, left_hip_idx, :]) / 2 + pred_pelvis = (pred_keypoints3d[:, right_hip_idx, :] + + pred_keypoints3d[:, left_hip_idx, :]) / 2 + + gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, None, :] + pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :] + loss = self.loss_keypoints3d( + pred_keypoints3d, gt_keypoints3d, reduction_override='none') + + # If has_keypoints3d is not None, then computes the losses on the + # instances that have ground-truth keypoints3d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints3d + # which have positive confidence. 
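+        # In both branches the per-keypoint loss is weighted by the
+        # confidence before averaging, so keypoints with zero confidence
+        # contribute nothing to the final value.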
+ + # has_keypoints3d is None when the key has_keypoints3d + # is not in the datasets + if has_keypoints3d is None: + + valid_pos = keypoints3d_conf > 0 + if keypoints3d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = torch.sum(loss * keypoints3d_conf) + loss /= keypoints3d_conf[valid_pos].numel() + else: + + keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1] + if keypoints3d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints3d) + loss = loss[has_keypoints3d == 1] + loss = (loss * keypoints3d_conf).mean() + return loss + + def compute_keypoints2d_loss( + self, + pred_keypoints3d: torch.Tensor, + pred_cam: torch.Tensor, + gt_keypoints2d: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 5000, + has_keypoints2d: Optional[torch.Tensor] = None): + """Compute loss for 2d keypoints.""" + keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1) + keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2) + gt_keypoints2d = gt_keypoints2d[:, :, :2].float() + pred_keypoints2d = project_points( + pred_keypoints3d, + pred_cam, + focal_length=focal_length, + img_res=img_res) + # Normalize keypoints to [-1,1] + # The coordinate origin of pred_keypoints_2d is + # the center of the input image. + pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1) + # The coordinate origin of gt_keypoints_2d is + # the top left corner of the input image. + gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1 + loss = self.loss_keypoints2d( + pred_keypoints2d, gt_keypoints2d, reduction_override='none') + + # If has_keypoints2d is not None, then computes the losses on the + # instances that have ground-truth keypoints2d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints2d + # which have positive confidence. 
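+        # Note the two normalisation formulas above: predictions are
+        # already centred on the image centre, while the ground truth uses
+        # the top-left corner as origin, hence the extra `- 1` for gt.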
+ # has_keypoints2d is None when the key has_keypoints2d + # is not in the datasets + + if has_keypoints2d is None: + valid_pos = keypoints2d_conf > 0 + if keypoints2d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = torch.sum(loss * keypoints2d_conf) + loss /= keypoints2d_conf[valid_pos].numel() + else: + keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1] + if keypoints2d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = loss[has_keypoints2d == 1] + loss = (loss * keypoints2d_conf).mean() + + return loss + + def compute_keypoints2d_loss_cliff( + self, + pred_keypoints3d: torch.Tensor, + pred_cam: torch.Tensor, + gt_keypoints2d: torch.Tensor, + camera_center: torch.Tensor, + focal_length: torch.Tensor, + trans: torch.Tensor, + img_res: Optional[int] = 224, + has_keypoints2d: Optional[torch.Tensor] = None): + """Compute loss for 2d keypoints.""" + keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1) + keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2) + gt_keypoints2d = gt_keypoints2d[:, :, :2].float() + + device = gt_keypoints2d.device + batch_size, num_keypoints = pred_keypoints3d.shape[0:2] + + pred_keypoints2d = perspective_projection( + pred_keypoints3d, + rotation=torch.eye(3, device=device).unsqueeze(0).expand( + batch_size, -1, -1), + translation=pred_cam, + focal_length=focal_length, + camera_center=camera_center) + + pred_keypoints2d = torch.cat( + (pred_keypoints2d, torch.ones(batch_size, num_keypoints, + 1).to(device)), + dim=2) + # trans @ pred_keypoints2d2 + pred_keypoints2d = torch.einsum('bij,bkj->bki', trans, + pred_keypoints2d) + + # The coordinate origin of pred_keypoints_2d and gt_keypoints_2d is + # the top left corner of the input image. + pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1) - 1 + gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1 + loss = self.loss_keypoints2d( + pred_keypoints2d, gt_keypoints2d, reduction_override='none') + + # If has_keypoints2d is not None, then computes the losses on the + # instances that have ground-truth keypoints2d. + # But the zero confidence keypoints will be included in mean. + # Otherwise, only compute the keypoints2d + # which have positive confidence. 
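+        # Unlike `compute_keypoints2d_loss`, the prediction above was
+        # projected with the full-image camera (focal_length and
+        # camera_center) and then mapped into the crop by the affine
+        # `trans`, so the reprojection error accounts for where the person
+        # actually sits in the full frame.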
+ # has_keypoints2d is None when the key has_keypoints2d + # is not in the datasets + + if has_keypoints2d is None: + valid_pos = keypoints2d_conf > 0 + if keypoints2d_conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = torch.sum(loss * keypoints2d_conf) + loss /= keypoints2d_conf[valid_pos].numel() + else: + keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1] + if keypoints2d_conf.shape[0] == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + loss = loss[has_keypoints2d == 1] + loss = (loss * keypoints2d_conf).mean() + + return loss + + def compute_vertex_loss(self, pred_vertices: torch.Tensor, + gt_vertices: torch.Tensor, has_smpl: torch.Tensor): + """Compute loss for vertices.""" + gt_vertices = gt_vertices.float() + conf = has_smpl.float().view(-1, 1, 1) + conf = conf.repeat(1, gt_vertices.shape[1], gt_vertices.shape[2]) + loss = self.loss_vertex( + pred_vertices, gt_vertices, reduction_override='none') + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_vertices) + loss = torch.sum(loss * conf) / conf[valid_pos].numel() + return loss + + def compute_smpl_pose_loss(self, pred_rotmat: torch.Tensor, + gt_pose: torch.Tensor, has_smpl: torch.Tensor): + """Compute loss for smpl pose.""" + conf = has_smpl.float().view(-1) + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_pose) + pred_rotmat = pred_rotmat[valid_pos] + gt_pose = gt_pose[valid_pos] + conf = conf[valid_pos] + gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3) + loss = self.loss_smpl_pose( + pred_rotmat, gt_rotmat, reduction_override='none') + loss = loss.view(loss.shape[0], -1).mean(-1) + loss = torch.mean(loss * conf) + return loss + + def compute_smpl_betas_loss(self, pred_betas: torch.Tensor, + gt_betas: torch.Tensor, + has_smpl: torch.Tensor): + """Compute loss for smpl betas.""" + conf = has_smpl.float().view(-1) + valid_pos = conf > 0 + if conf[valid_pos].numel() == 0: + return torch.Tensor([0]).type_as(gt_betas) + pred_betas = pred_betas[valid_pos] + gt_betas = gt_betas[valid_pos] + conf = conf[valid_pos] + loss = self.loss_smpl_betas( + pred_betas, gt_betas, reduction_override='none') + loss = loss.view(loss.shape[0], -1).mean(-1) + loss = torch.mean(loss * conf) + return loss + + def compute_camera_loss(self, cameras: torch.Tensor): + """Compute loss for predicted camera parameters.""" + loss = self.loss_camera(cameras) + return loss + + def compute_part_segmentation_loss(self, + pred_heatmap: torch.Tensor, + gt_vertices: torch.Tensor, + gt_keypoints2d: torch.Tensor, + gt_model_joints: torch.Tensor, + has_smpl: torch.Tensor, + img_res: Optional[int] = 224, + focal_length: Optional[int] = 500): + """Compute loss for part segmentations.""" + device = gt_keypoints2d.device + gt_keypoints2d_valid = gt_keypoints2d[has_smpl == 1] + batch_size = gt_keypoints2d_valid.shape[0] + + gt_vertices_valid = gt_vertices[has_smpl == 1] + gt_model_joints_valid = gt_model_joints[has_smpl == 1] + + if batch_size == 0: + return torch.Tensor([0]).type_as(gt_keypoints2d) + gt_cam_t = estimate_translation( + gt_model_joints_valid, + gt_keypoints2d_valid, + focal_length=focal_length, + img_size=img_res, + ) + + K = torch.eye(3) + K[0, 0] = focal_length + K[1, 1] = focal_length + K[2, 2] = 1 + K[0, 2] = img_res / 2. + K[1, 2] = img_res / 2. 
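+        # K is the pinhole intrinsic matrix used for rendering the part
+        # silhouettes: focal_length on the diagonal and the principal
+        # point at the centre of the (square) crop.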
+ K = K[None, :, :] + + R = torch.eye(3)[None, :, :] + device = gt_keypoints2d.device + gt_sem_mask = visualize_smpl.render_smpl( + verts=gt_vertices_valid, + R=R, + K=K, + T=gt_cam_t, + render_choice='part_silhouette', + resolution=img_res, + return_tensor=True, + body_model=self.body_model_train, + device=device, + in_ndc=False, + convention='pytorch3d', + projection='perspective', + no_grad=True, + batch_size=batch_size, + verbose=False, + ) + gt_sem_mask = torch.flip(gt_sem_mask, [1, 2]).squeeze(-1).detach() + pred_heatmap_valid = pred_heatmap[has_smpl == 1] + ph, pw = pred_heatmap_valid.size(2), pred_heatmap_valid.size(3) + h, w = gt_sem_mask.size(1), gt_sem_mask.size(2) + if ph != h or pw != w: + pred_heatmap_valid = F.interpolate( + input=pred_heatmap_valid, size=(h, w), mode='bilinear') + + loss = self.loss_segm_mask(pred_heatmap_valid, gt_sem_mask) + return loss + + def compute_losses(self, predictions: dict, targets: dict): + """Compute losses.""" + pred_betas = predictions['pred_shape'].view(-1, 10) + pred_pose = predictions['pred_pose'].view(-1, 24, 3, 3) + pred_cam_crop = predictions['pred_cam'].view(-1, 3) + + # NOTE: convert cam parameters from the crop to the full camera + img_h, img_w = targets['img_h'], targets['img_w'] + center, scale, focal_length = targets['center'], targets[ + 'scale'][:, 0], targets['focal_length'].squeeze(dim=1) + full_img_shape = torch.hstack((img_h, img_w)) + pred_cam = cam_crop2full(pred_cam_crop, center, scale, full_img_shape, + focal_length).to(torch.float32) + + gt_keypoints3d = targets['keypoints3d'] + # this should be in full frame + gt_keypoints2d = targets['keypoints2d'] + # pred_pose N, 24, 3, 3 + if self.body_model_train is not None: + pred_output = self.body_model_train( + betas=pred_betas, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, 0].unsqueeze(1), + pose2rot=False, + num_joints=gt_keypoints2d.shape[1]) + pred_keypoints3d = pred_output['joints'] + pred_vertices = pred_output['vertices'] + + # NOTE: use crop_trans to contain full -> crop so that pred keypoints + # are normalized to bbox + camera_center = torch.hstack((img_w, img_h)) / 2 + trans = targets['crop_trans'].float() + + # TODO: temp solution + if 'valid_fit' in targets: + has_smpl = targets['valid_fit'].view(-1) + # global_orient = targets['opt_pose'][:, :3].view(-1, 1, 3) + gt_pose = targets['opt_pose'] + gt_betas = targets['opt_betas'] + gt_vertices = targets['opt_vertices'] + else: + has_smpl = targets['has_smpl'].view(-1) + gt_pose = targets['smpl_body_pose'] + global_orient = targets['smpl_global_orient'].view(-1, 1, 3) + gt_pose = torch.cat((global_orient, gt_pose), dim=1).float() + gt_betas = targets['smpl_betas'].float() + + # gt_pose N, 72 + if self.body_model_train is not None: + gt_output = self.body_model_train( + betas=gt_betas, + body_pose=gt_pose[:, 3:], + global_orient=gt_pose[:, :3], + num_joints=gt_keypoints2d.shape[1]) + gt_vertices = gt_output['vertices'] + gt_model_joints = gt_output['joints'] + if 'has_keypoints3d' in targets: + has_keypoints3d = targets['has_keypoints3d'].squeeze(-1) + else: + has_keypoints3d = None + if 'has_keypoints2d' in targets: + has_keypoints2d = targets['has_keypoints2d'].squeeze(-1) + else: + has_keypoints2d = None + if 'pred_segm_mask' in predictions: + pred_segm_mask = predictions['pred_segm_mask'] + losses = {} + if self.loss_keypoints3d is not None: + losses['keypoints3d_loss'] = self.compute_keypoints3d_loss( + pred_keypoints3d, + gt_keypoints3d, + has_keypoints3d=has_keypoints3d) + if self.loss_keypoints2d 
is not None: + losses['keypoints2d_loss'] = self.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + if self.loss_vertex is not None: + losses['vertex_loss'] = self.compute_vertex_loss( + pred_vertices, gt_vertices, has_smpl) + if self.loss_smpl_pose is not None: + losses['smpl_pose_loss'] = self.compute_smpl_pose_loss( + pred_pose, gt_pose, has_smpl) + if self.loss_smpl_betas is not None: + losses['smpl_betas_loss'] = self.compute_smpl_betas_loss( + pred_betas, gt_betas, has_smpl) + if self.loss_camera is not None: + losses['camera_loss'] = self.compute_camera_loss(pred_cam) + if self.loss_segm_mask is not None: + losses['loss_segm_mask'] = self.compute_part_segmentation_loss( + pred_segm_mask, gt_vertices, gt_keypoints2d, gt_model_joints, + has_smpl) + + return losses + + @abstractmethod + def make_fake_data(self, predictions, requires_grad): + pass + + @abstractmethod + def make_real_data(self, data_batch): + pass + + @abstractmethod + def prepare_targets(self, data_batch): + pass + + def forward_train(self, **kwargs): + """Forward function for general training. + + For mesh estimation, we do not use this interface. + """ + raise NotImplementedError('This interface should not be used in ' + 'current training schedule. Please use ' + '`train_step` for training.') + + @abstractmethod + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + pass + + +class CliffImageBodyModelEstimator(BodyModelEstimator): + + def make_fake_data(self, predictions: dict, requires_grad: bool): + pred_cam = predictions['pred_cam'] + pred_pose = predictions['pred_pose'] + pred_betas = predictions['pred_shape'] + if requires_grad: + fake_data = (pred_cam, pred_pose, pred_betas) + else: + fake_data = (pred_cam.detach(), pred_pose.detach(), + pred_betas.detach()) + return fake_data + + def make_real_data(self, data_batch: dict): + transl = data_batch['adv_smpl_transl'].float() + global_orient = data_batch['adv_smpl_global_orient'] + body_pose = data_batch['adv_smpl_body_pose'] + betas = data_batch['adv_smpl_betas'].float() + pose = torch.cat((global_orient, body_pose), dim=-1).float() + real_data = (transl, pose, betas) + return real_data + + def prepare_targets(self, data_batch: dict): + # Image Mesh Estimator does not need extra process for ground truth + return data_batch + + def forward_test(self, img: torch.Tensor, img_metas: dict, **kwargs): + """Defines the computation performed at every call when testing.""" + if self.backbone is not None: + features = self.backbone(img) + else: + features = kwargs['features'] + + if self.neck is not None: + features = self.neck(features) + + # NOTE: extras for Cliff inference + bbox_info = kwargs['bbox_info'] + predictions = self.head(features, bbox_info) + pred_pose = predictions['pred_pose'] + pred_betas = predictions['pred_shape'] + pred_cam_crop = predictions['pred_cam'].view(-1, 3) + + # convert the camera parameters from the crop camera to the full camera + img_h, img_w = kwargs['img_h'], kwargs['img_w'] + center, scale, focal_length = kwargs['center'], kwargs[ + 'scale'][:, 0], kwargs['focal_length'].squeeze(dim=1) + full_img_shape = torch.hstack((img_h, img_w)) + + pred_cam = cam_crop2full(pred_cam_crop, center, scale, full_img_shape, + focal_length).to(torch.float32) + + pred_output = self.body_model_test( + betas=pred_betas, + body_pose=pred_pose[:, 1:], + global_orient=pred_pose[:, 
0].unsqueeze(1), + pose2rot=False) + + pred_vertices = pred_output['vertices'] + pred_keypoints_3d = pred_output['joints'] + all_preds = {} + all_preds['keypoints_3d'] = pred_keypoints_3d.detach().cpu().numpy() + all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy() + all_preds['smpl_beta'] = pred_betas.detach().cpu().numpy() + all_preds['camera'] = pred_cam.detach().cpu().numpy() + all_preds['vertices'] = pred_vertices.detach().cpu().numpy() + image_path = [] + for img_meta in img_metas: + image_path.append(img_meta['image_path']) + all_preds['image_path'] = image_path + all_preds['image_idx'] = kwargs['sample_idx'] + return all_preds diff --git a/mmhuman3d/models/heads/builder.py b/mmhuman3d/models/heads/builder.py index 5e15c8ef..bdaa7e12 100644 --- a/mmhuman3d/models/heads/builder.py +++ b/mmhuman3d/models/heads/builder.py @@ -2,6 +2,7 @@ from mmcv.utils import Registry +from .cliff_head import CliffHead from .expose_head import ExPoseBodyHead, ExPoseFaceHead, ExPoseHandHead from .hmr_head import HMRHead from .hybrik_head import HybrIKHead @@ -16,6 +17,7 @@ HEADS.register_module(name='ExPoseBodyHead', module=ExPoseBodyHead) HEADS.register_module(name='ExPoseHandHead', module=ExPoseHandHead) HEADS.register_module(name='ExPoseFaceHead', module=ExPoseFaceHead) +HEADS.register_module(name='CliffHead', module=CliffHead) HEADS.register_module(name='PyMAFXHead', module=PyMAFXHead) HEADS.register_module(name='Regressor', module=Regressor) diff --git a/mmhuman3d/models/heads/cliff_head.py b/mmhuman3d/models/heads/cliff_head.py new file mode 100644 index 00000000..037e37d1 --- /dev/null +++ b/mmhuman3d/models/heads/cliff_head.py @@ -0,0 +1,98 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.runner.base_module import BaseModule + +from mmhuman3d.utils.geometry import rot6d_to_rotmat + + +class CliffHead(BaseModule): + + def __init__(self, + feat_dim, + smpl_mean_params=None, + npose=144, + nbeta=10, + ncam=3, + nbbox=3, + hdim=1024, + init_cfg=None): + super(CliffHead, self).__init__(init_cfg=init_cfg) + self.fc1 = nn.Linear(feat_dim + nbbox + npose + nbeta + ncam, hdim) + self.drop1 = nn.Dropout() + self.fc2 = nn.Linear(hdim, hdim) + self.drop2 = nn.Dropout() + self.decpose = nn.Linear(hdim, npose) + self.decshape = nn.Linear(hdim, nbeta) + self.deccam = nn.Linear(hdim, ncam) + + nn.init.xavier_uniform_(self.decpose.weight, gain=0.01) + nn.init.xavier_uniform_(self.decshape.weight, gain=0.01) + nn.init.xavier_uniform_(self.deccam.weight, gain=0.01) + + if smpl_mean_params is None: + init_pose = torch.zeros([1, npose]) + init_shape = torch.zeros([1, nbeta]) + init_cam = torch.FloatTensor([[1, 0, 0]]) + else: + mean_params = np.load(smpl_mean_params) + init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0) + init_shape = torch.from_numpy( + mean_params['shape'][:].astype('float32')).unsqueeze(0) + init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0) + self.register_buffer('init_pose', init_pose) + self.register_buffer('init_shape', init_shape) + self.register_buffer('init_cam', init_cam) + + def forward(self, + x, + bbox_info, + init_pose=None, + init_shape=None, + init_cam=None, + n_iter=3): + + # inherited from hmr head, only support one layer feature + if isinstance(x, list) or isinstance(x, tuple): + x = x[-1] + + output_seq = False + if len(x.shape) == 4: + # use feature from the last layer of the backbone + # apply global average pooling on the feature map + x = x.mean(dim=-1).mean(dim=-1) + elif len(x.shape) == 3: + # temporal feature + raise 
NotImplementedError + + batch_size = x.shape[0] + if init_pose is None: + init_pose = self.init_pose.expand(batch_size, -1) + if init_shape is None: + init_shape = self.init_shape.expand(batch_size, -1) + if init_cam is None: + init_cam = self.init_cam.expand(batch_size, -1) + + pred_pose = init_pose + pred_shape = init_shape + pred_cam = init_cam + for i in range(n_iter): + xc = torch.cat([x, bbox_info, pred_pose, pred_shape, pred_cam], 1) + xc = self.fc1(xc) + xc = self.drop1(xc) + xc = self.fc2(xc) + xc = self.drop2(xc) + pred_pose = self.decpose(xc) + pred_pose + pred_shape = self.decshape(xc) + pred_shape + pred_cam = self.deccam(xc) + pred_cam + + pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3) + + if output_seq: + raise NotImplementedError + output = { + 'pred_pose': pred_rotmat, + 'pred_shape': pred_shape, + 'pred_cam': pred_cam + } + return output diff --git a/mmhuman3d/utils/geometry.py b/mmhuman3d/utils/geometry.py index 88dcdccd..09a8cef4 100644 --- a/mmhuman3d/utils/geometry.py +++ b/mmhuman3d/utils/geometry.py @@ -417,6 +417,27 @@ def weak_perspective_projection(points, scale, translation): return projected_points +def cam_crop2full(crop_cam, center, scale, full_img_shape, focal_length): + """convert the camera parameters from the crop camera to the full camera. + + :param crop_cam: shape=(N, 3) weak perspective camera in cropped + img coordinates (s, tx, ty) + :param center: shape=(N, 2) bbox coordinates (c_x, c_y) + :param scale: shape=(N, 1) square bbox resolution (b / 200) + :param full_img_shape: shape=(N, 2) original image height and width + :param focal_length: shape=(N,) + :return: + """ + img_h, img_w = full_img_shape[:, 0], full_img_shape[:, 1] + cx, cy, b = center[:, 0], center[:, 1], scale + bs = b * crop_cam[:, 0] + 1e-9 + tz = 2 * focal_length / bs + tx = (2 * (cx - img_w / 2.) / bs) + crop_cam[:, 1] + ty = (2 * (cy - img_h / 2.) 
/ bs) + crop_cam[:, 2] + full_cam = torch.stack([tx, ty, tz], dim=-1) + return full_cam + + def projection(pred_joints, pred_camera, iwp_mode=True): """Project 3D points on the image plane based on the given camera info, Identity rotation and Weak Perspective (IWP) camera is used when diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 40562562..cb4edc25 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -8,7 +8,7 @@ h5py matplotlib numpy opencv-python -pandas +pandas<2.0.0 pickle5 plyfile rtree diff --git a/setup.cfg b/setup.cfg index 8bc3c131..a5899aa2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,6 @@ multi_line_output = 3 include_trailing_comma = true known_standard_library = pkg_resources,setuptools known_first_party = mmhuman3d -known_third_party =PIL,cdflib,colormap,cv2,einops,h5py,matplotlib,mmcv,mpl_toolkits,numpy,openpifpaf,pickle5,plyfile,pytest,pytorch3d,pytorch_sphinx_theme,scipy,skimage,smplx,surrogate,torch,tqdm,trimesh,vedo +known_third_party = PIL,cdflib,colormap,cv2,einops,h5py,matplotlib,mmcv,mpl_toolkits,numpy,openpifpaf,pickle5,plyfile,pytest,pytorch3d,pytorch_sphinx_theme,scipy,skimage,smplx,surrogate,torch,tqdm,trimesh,vedo no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/tests/test_data_converters.py b/tests/test_data_converters.py index 4210e2fb..b9e77f80 100644 --- a/tests/test_data_converters.py +++ b/tests/test_data_converters.py @@ -323,6 +323,14 @@ def test_multi_human_data_preprocess(): assert os.path.exists('/tmp/preprocessed_npzs/' + 'crowdpose_test.npz') assert os.path.exists('/tmp/preprocessed_npzs/' + 'crowdpose_trainval.npz') + CLIFF_ROOT = os.path.join(root_path, 'eft') + cfg = dict(type='CliffConverter', modes=['coco', 'mpii']) + data_converter = build_data_converter(cfg) + data_converter.convert( + CLIFF_ROOT, output_path, enable_multi_human_data=True) + assert os.path.exists('/tmp/preprocessed_npzs/' + 'cliff_coco_train.npz') + assert os.path.exists('/tmp/preprocessed_npzs/' + 'cliff_mpii_train.npz') + def test_preprocessed_npz(): npz_folder = '/tmp/preprocessed_npzs' diff --git a/tests/test_datasets/test_pipelines.py b/tests/test_datasets/test_pipelines.py index 4e0daf61..f14a003a 100644 --- a/tests/test_datasets/test_pipelines.py +++ b/tests/test_datasets/test_pipelines.py @@ -2,6 +2,7 @@ import pytest from mmhuman3d.data.datasets.pipelines import ( + GetBboxInfo, LoadImageFromFile, SyntheticOcclusion, ) @@ -57,3 +58,17 @@ def test_synthetic_occlusion(): results = pipeline(results) assert results['img'].shape == (224, 224, 3) + + +def test_get_bbox_inf(): + pipeline = GetBboxInfo() + results = { + 'img': np.ones((224, 224, 3)), + 'center': np.array([100, 100]), + 'scale': np.array([10, 10]) + } + pipeline(results=results) + assert 'img_h' in results + assert 'img_w' in results + assert 'focal_length' in results + assert 'bbox_info' in results diff --git a/tests/test_models/test_architectures/test_cliff_mesh_estimator.py b/tests/test_models/test_architectures/test_cliff_mesh_estimator.py new file mode 100644 index 00000000..2896d787 --- /dev/null +++ b/tests/test_models/test_architectures/test_cliff_mesh_estimator.py @@ -0,0 +1,417 @@ +import torch + +from mmhuman3d.core.cameras import build_cameras +from mmhuman3d.models.architectures.cliff_mesh_estimator import \ + CliffImageBodyModelEstimator # noqa: E501 +from mmhuman3d.models.body_models.builder import build_body_model +from mmhuman3d.utils.geometry import project_points + + +def test_cliff_image_body_mesh_estimator(): 
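+ # Constructor smoke test: build a full CLIFF image-based estimator from
+ # config dicts and check that every sub-module is attached. The checkpoint
+ # and body-model files referenced below are assumed to be downloaded and
+ # arranged as described in configs/cliff/README.md.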
+ backbone = dict( + type='ResNet', + depth=50, + out_indices=[3], + norm_eval=False, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')) + head = dict( + type='CliffHead', + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz') + body_model_train = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_54', + model_path='data/body_models/smpl', + keypoint_approximate=True, + extra_joints_regressor='data/body_models/J_regressor_extra.npy') + body_model_test = dict( + type='SMPL', + keypoint_src='h36m', + keypoint_dst='h36m', + model_path='data/body_models/smpl', + joints_regressor='data/body_models/J_regressor_h36m.npy') + convention = 'smpl_54' + loss_keypoints3d = dict(type='SmoothL1Loss', loss_weight=100) + loss_keypoints2d = dict(type='SmoothL1Loss', loss_weight=10) + loss_vertex = dict(type='L1Loss', loss_weight=2) + loss_smpl_pose = dict(type='MSELoss', loss_weight=3) + loss_smpl_betas = dict(type='MSELoss', loss_weight=0.02) + loss_adv = dict( + type='GANLoss', + gan_type='lsgan', + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1) + model = CliffImageBodyModelEstimator( + backbone=backbone, + head=head, + body_model_train=body_model_train, + body_model_test=body_model_test, + convention=convention, + loss_keypoints3d=loss_keypoints3d, + loss_keypoints2d=loss_keypoints2d, + loss_vertex=loss_vertex, + loss_smpl_pose=loss_smpl_pose, + loss_smpl_betas=loss_smpl_betas, + loss_adv=loss_adv) + assert model.backbone is not None + assert model.head is not None + assert model.body_model_train is not None + assert model.body_model_test is not None + assert model.convention == 'smpl_54' + assert model.loss_keypoints3d is not None + assert model.loss_keypoints2d is not None + assert model.loss_vertex is not None + assert model.loss_smpl_pose is not None + assert model.loss_smpl_betas is not None + assert model.loss_adv is not None + + +def test_compute_keypoints3d_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints3d = torch.zeros((32, 54, 4)) + loss_empty = model.compute_keypoints3d_loss(pred_keypoints3d, + gt_keypoints3d) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints3d = torch.randn((32, 54, 4)) + gt_keypoints3d[:, :, 3] = torch.sigmoid(gt_keypoints3d[:, :, 3]) + loss = model.compute_keypoints3d_loss(pred_keypoints3d, gt_keypoints3d) + assert loss > 0 + + has_keypoints3d = torch.ones(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss > 0 + has_keypoints3d = torch.zeros(32) + loss = model.compute_keypoints3d_loss( + pred_keypoints3d, gt_keypoints3d, has_keypoints3d=has_keypoints3d) + assert loss == 0 + + +def test_compute_keypoints2d_loss_cliff(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints2d = torch.zeros((32, 54, 3)) + pred_cam = torch.randn((32, 3)) + camera_center = torch.randn((32, 2)) + trans = torch.randn((32, 2, 3)) + focal_length = 5000 + loss_empty = model.compute_keypoints2d_loss_cliff(pred_keypoints3d, + pred_cam, gt_keypoints2d, + camera_center, + focal_length, trans) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints2d = 
torch.randn((32, 54, 3)) + gt_keypoints2d[:, :, 2] = torch.sigmoid(gt_keypoints2d[:, :, 2]) + pred_cam = torch.randn((32, 3)) + loss = model.compute_keypoints2d_loss_cliff(pred_keypoints3d, pred_cam, + gt_keypoints2d, camera_center, + focal_length, trans) + assert loss > 0 + + has_keypoints2d = torch.ones((32)) + loss = model.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.zeros((32)) + loss = model.compute_keypoints2d_loss_cliff( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + camera_center, + focal_length, + trans, + has_keypoints2d=has_keypoints2d) + assert loss == 0 + + +def test_compute_keypoints2d_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10)) + + pred_keypoints3d = torch.zeros((32, 54, 3)) + gt_keypoints2d = torch.zeros((32, 54, 3)) + pred_cam = torch.randn((32, 3)) + loss_empty = model.compute_keypoints2d_loss(pred_keypoints3d, pred_cam, + gt_keypoints2d) + assert loss_empty == 0 + + pred_keypoints3d = torch.randn((32, 54, 3)) + gt_keypoints2d = torch.randn((32, 54, 3)) + gt_keypoints2d[:, :, 2] = torch.sigmoid(gt_keypoints2d[:, :, 2]) + pred_cam = torch.randn((32, 3)) + loss = model.compute_keypoints2d_loss(pred_keypoints3d, pred_cam, + gt_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.ones((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss > 0 + + has_keypoints2d = torch.zeros((32)) + loss = model.compute_keypoints2d_loss( + pred_keypoints3d, + pred_cam, + gt_keypoints2d, + has_keypoints2d=has_keypoints2d) + assert loss == 0 + + +def test_compute_vertex_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', loss_vertex=dict(type='L1Loss', loss_weight=2)) + + pred_vertices = torch.randn((32, 4096, 3)) + gt_vertices = torch.randn((32, 4096, 3)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_vertex_loss(pred_vertices, gt_vertices, + has_smpl) + assert loss_empty == 0 + + pred_vertices = torch.randn((32, 4096, 3)) + gt_vertices = torch.randn((32, 4096, 3)) + has_smpl = torch.ones((32)) + loss = model.compute_vertex_loss(pred_vertices, gt_vertices, has_smpl) + assert loss > 0 + + +def test_compute_smpl_pose_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_smpl_pose=dict(type='MSELoss', loss_weight=3)) + + pred_rotmat = torch.randn((32, 24, 3, 3)) + gt_pose = torch.randn((32, 24, 3)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_smpl_pose_loss(pred_rotmat, gt_pose, has_smpl) + assert loss_empty == 0 + + pred_rotmat = torch.randn((32, 24, 3, 3)) + gt_pose = torch.randn((32, 24, 3)) + has_smpl = torch.ones((32)) + loss = model.compute_smpl_pose_loss(pred_rotmat, gt_pose, has_smpl) + assert loss > 0 + + +def test_compute_part_segm_loss(): + N = 1 + random_body_pose = torch.rand((N, 69)) + body_model_train = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + model_path='data/body_models/smpl', + extra_joints_regressor='data/body_models/J_regressor_extra.npy') + body_model = build_body_model(body_model_train) + + body_model_output = body_model(body_pose=random_body_pose, ) + gt_model_joins = body_model_output['joints'].detach() + cam = torch.ones(N, 3) + gt_keypoints2d = project_points( + gt_model_joins, cam, focal_length=5000, img_res=224) + 
loss_segm_mask = dict(type='CrossEntropyLoss', loss_weight=60) + + gt_keypoints2d = torch.cat([gt_keypoints2d, torch.ones(N, 49, 1)], dim=-1) + model = CliffImageBodyModelEstimator( + body_model_train=body_model_train, + loss_segm_mask=loss_segm_mask, + ) + gt_vertices = torch.randn(N, 6890, 3) + pred_heatmap = torch.zeros(N, 25, 224, 224) + pred_heatmap[:, 0, :, :] = 1 + has_smpl = torch.ones((N)) + + loss = model.compute_part_segmentation_loss( + pred_heatmap, + gt_vertices, + has_smpl=has_smpl, + gt_keypoints2d=gt_keypoints2d, + gt_model_joints=gt_model_joins) + assert loss > 0 + + +def test_compute_smpl_betas_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02)) + + pred_betas = torch.randn((32, 10)) + gt_betas = torch.randn((32, 10)) + has_smpl = torch.zeros((32)) + loss_empty = model.compute_smpl_betas_loss(pred_betas, gt_betas, has_smpl) + assert loss_empty == 0 + + pred_betas = torch.randn((32, 10)) + gt_betas = torch.randn((32, 10)) + has_smpl = torch.ones((32)) + loss = model.compute_smpl_betas_loss(pred_betas, gt_betas, has_smpl) + assert loss > 0 + + +def test_compute_camera_loss(): + model = CliffImageBodyModelEstimator( + convention='smpl_54', + loss_camera=dict(type='CameraPriorLoss', loss_weight=60), + ) + + pred_cameras = torch.randn((32, 3)) + loss = model.compute_camera_loss(pred_cameras) + assert loss > 0 + + +def test_compute_losses(): + N = 32 + predictions = {} + predictions['pred_shape'] = torch.randn(N, 10) + predictions['pred_pose'] = torch.randn(N, 24, 3, 3) + predictions['pred_cam'] = torch.randn(N, 3) + + targets = {} + targets['keypoints3d'] = torch.randn(N, 45, 4) + targets['keypoints2d'] = torch.randn(N, 45, 3) + targets['has_smpl'] = torch.ones(N) + targets['smpl_body_pose'] = torch.randn(N, 23, 3) + targets['smpl_global_orient'] = torch.randn(N, 3) + targets['smpl_betas'] = torch.randn(N, 10) + targets['img_h'] = torch.ones(N, 1) * 256 + targets['img_w'] = torch.ones(N, 1) * 192 + targets['center'] = torch.randn(N, 2) + targets['scale'] = torch.randn(N, 1) + targets['focal_length'] = torch.randn(N, 1) + targets['crop_trans'] = torch.randn(N, 2, 3) + + model = CliffImageBodyModelEstimator(convention='smpl_54') + loss = model.compute_losses(predictions, targets) + assert loss == {} + + model = CliffImageBodyModelEstimator( + convention='smpl_45', + body_model_train=dict( + type='SMPL', + keypoint_src='smpl_45', + keypoint_dst='smpl_45', + model_path='data/body_models/smpl'), + loss_keypoints3d=dict(type='SmoothL1Loss', loss_weight=100), + loss_keypoints2d=dict(type='SmoothL1Loss', loss_weight=10), + loss_vertex=dict(type='L1Loss', loss_weight=2), + loss_smpl_pose=dict(type='MSELoss', loss_weight=3), + loss_smpl_betas=dict(type='MSELoss', loss_weight=0.02), + loss_camera=dict(type='CameraPriorLoss', loss_weight=60)) + + loss = model.compute_losses(predictions, targets) + assert 'keypoints3d_loss' in loss + assert 'keypoints2d_loss' in loss + assert 'vertex_loss' in loss + assert 'smpl_pose_loss' in loss + assert 'smpl_betas_loss' in loss + assert 'camera_loss' in loss + + +def test_run_registration(): + batch_size = 2 + body_model = dict( + type='SMPL', + keypoint_src='smpl_54', + keypoint_dst='smpl_49', + keypoint_approximate=True, + model_path='data/body_models/smpl', + extra_joints_regressor='data/body_models/J_regressor_extra.npy', + batch_size=batch_size) + + camera = build_cameras( + dict( + type='PerspectiveCameras', + convention='opencv', + in_ndc=False, + focal_length=5000, + 
image_size=(224, 224), + principal_point=(112, 112))) + + registrant = dict( + type='SMPLify', + body_model=body_model, + num_epochs=1, + stages=[ + dict( + num_iter=1, + fit_global_orient=True, + fit_transl=True, + fit_body_pose=False, + fit_betas=False) + ], + optimizer=dict(type='Adam', lr=1e-2, betas=(0.9, 0.999)), + keypoints2d_loss=dict( + type='KeypointMSELoss', + loss_weight=1.0, + reduction='sum', + sigma=100), + device=torch.device('cpu'), + camera=camera) + + registration = dict(mode='in_the_loop', registrant=registrant) + + model = CliffImageBodyModelEstimator( + body_model_train=body_model, registration=registration) + assert model.registrant is not None + assert model.fits_dict is not None + + transl = torch.Tensor([0, 0, 1]).view(1, 3).expand(batch_size, -1) + + predictions = dict( + pred_pose=torch.zeros((batch_size, 24, 3, 3)), + pred_shape=torch.zeros((batch_size, 10)), + pred_cam=transl, + ) + + # generate 2D keypoints + smpl = build_body_model(body_model) + keypoints3d = smpl(transl=transl)['joints'].detach() + keypoints2d_xyd = camera.transform_points_screen(keypoints3d) + keypoints2d = keypoints2d_xyd[..., :2] + keypoints2d_conf = torch.ones(*keypoints2d.shape[:2], 1) + keypoints2d = torch.cat([keypoints2d, keypoints2d_conf], dim=-1) + + targets = dict( + img_metas=[dict(dataset_name='coco'), + dict(dataset_name='h36m')], + sample_idx=torch.zeros((batch_size, 1), dtype=torch.int), + is_flipped=torch.tensor([0, 1], dtype=torch.int), + rotation=torch.tensor([0.0, 0.1]), + smpl_betas=torch.zeros((batch_size, 10)), + smpl_global_orient=torch.zeros((batch_size, 3)), + smpl_body_pose=torch.zeros((batch_size, 69)), + keypoints2d=keypoints2d, + has_smpl=torch.tensor([0, 1], dtype=torch.int)) + + model.run_registration(predictions=predictions, targets=targets) + assert 'valid_fit' in targets + assert 'opt_vertices' in targets + assert 'opt_cam_t' in targets + assert 'opt_joints' in targets + assert 'opt_pose' in targets + assert 'opt_betas' in targets diff --git a/tests/test_models/test_heads/test_cliff_head.py b/tests/test_models/test_heads/test_cliff_head.py new file mode 100644 index 00000000..691add98 --- /dev/null +++ b/tests/test_models/test_heads/test_cliff_head.py @@ -0,0 +1,59 @@ +import numpy as np +import pytest +import torch + +from mmhuman3d.models.heads.builder import CliffHead + + +def test_cliff_head(): + # initialize models + model = CliffHead( + feat_dim=2048, + smpl_mean_params='data/body_models/smpl_mean_params.npz') + + # image feature from backbone + batch_size = 32 + bbox_info = [-0.5, 0.2, 1.5] + bbox_info = torch.FloatTensor([bbox_info] * batch_size) + x0_shape = (batch_size, 2048, 7, 7) + x0 = _demo_head_inputs(x0_shape) + x0 = torch.tensor(x0).float() + y0 = model(x0, bbox_info) + assert y0['pred_pose'].shape == (batch_size, 24, 3, 3) + assert y0['pred_shape'].shape == (batch_size, 10) + assert y0['pred_cam'].shape == (batch_size, 3) + + # image feature from multi-layer backbone + x1_1_shape = (batch_size, 1024, 14, 14) + x1_2_shape = (batch_size, 2048, 7, 7) + x1 = [_demo_head_inputs(x1_1_shape), _demo_head_inputs(x1_2_shape)] + y1 = model(x1, bbox_info) + assert y1['pred_pose'].shape == (batch_size, 24, 3, 3) + assert y1['pred_shape'].shape == (batch_size, 10) + assert y1['pred_cam'].shape == (batch_size, 3) + + # test temporal feature + T = 16 + x_temp_shape = (batch_size, T, 1024) + x_temp = _demo_head_inputs(x_temp_shape) + with pytest.raises(NotImplementedError): + model(x_temp, bbox_info) + + # test other cases + model_wo_smpl_mean_params = 
CliffHead(feat_dim=2048) + assert model_wo_smpl_mean_params.init_pose.shape == (1, 144) + assert model_wo_smpl_mean_params.init_shape.shape == (1, 10) + assert model_wo_smpl_mean_params.init_cam.shape == (1, 3) + + +def _demo_head_inputs(input_shape=(1, 3, 64, 64)): + """Create a superset of inputs needed to run models. + + Args: + input_shape (tuple): input batch dimensions. + Default: (1, 3, 64, 64). + """ + features = np.random.random(input_shape) + features = torch.FloatTensor(features) + + return features diff --git a/tools/convert_datasets.py b/tools/convert_datasets.py index 83d50ffd..bd06fbcd 100644 --- a/tools/convert_datasets.py +++ b/tools/convert_datasets.py @@ -57,7 +57,7 @@ gta_human=dict(type='GTAHumanConverter', prefix='gta_human'), humman=dict( type='HuMManConverter', modes=['train', 'test'], prefix='humman'), -) + cliff=dict(type='CliffConverter', modes=['coco', 'mpii'])) def parse_args():