open-mmlab
diff --git a/‎README.md
Lines changed: 1 addition & 0 deletions b/‎README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎README_CN.md
Lines changed: 1 addition & 0 deletions b/‎README_CN.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎configs/hand3d/InterNet/README.md
Lines changed: 25 additions & 0 deletions b/‎configs/hand3d/InterNet/README.md
Lines changed: 25 additions & 0 deletions
diff --git a/‎configs/hand3d/InterNet/interhand3d/res50_interhand3d_all_256x256.py
Lines changed: 180 additions & 0 deletions b/‎configs/hand3d/InterNet/interhand3d/res50_interhand3d_all_256x256.py
Lines changed: 180 additions & 0 deletions
diff --git a/‎mmpose/core/evaluation/__init__.py
Lines changed: 5 additions & 2 deletions b/‎mmpose/core/evaluation/__init__.py
Lines changed: 5 additions & 2 deletions
diff --git a/‎mmpose/core/evaluation/top_down_eval.py
Lines changed: 98 additions & 0 deletions b/‎mmpose/core/evaluation/top_down_eval.py
Lines changed: 98 additions & 0 deletions
diff --git a/‎mmpose/datasets/datasets/hand/interhand3d_dataset.py
Lines changed: 4 additions & 4 deletions b/‎mmpose/datasets/datasets/hand/interhand3d_dataset.py
Lines changed: 4 additions & 4 deletions
@@ -80,6 +80,7 @@ Supported algorithms:
 - [x] [RSN](configs/top_down/rsn/README.md) (ECCV'2020)
 - [x] [HMR](configs/mesh/hmr/README.md) (CVPR'2018)
 - [x] [Simple 3D Baseline](configs/body3d/simple_baseline/README.md) (ICCV'2017)
+- [x] [InterNet](configs/hand3d/InterNet/README.md) (ECCV'2020)
 
 </details>
 
 
@@ -81,6 +81,7 @@ MMPose 是一款基于 PyTorch 的姿态分析的开源工具箱，是 [OpenMMLa
 - [x] [RSN](configs/top_down/rsn/README.md) (ECCV'2020)
 - [x] [HMR](configs/mesh/hmr/README.md) (CVPR'2018)
 - [x] [Simple 3D Baseline](configs/body3d/simple_baseline/README.md) (ICCV'2017)
+- [x] [InterNet](configs/hand3d/InterNet/README.md) (ECCV'2020)
 
 </details>
 
 
@@ -0,0 +1,25 @@
+# InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image
+
+## Introduction
+
+<!-- [ALGORITHM] -->
+
+```bibtex
+@InProceedings{Moon_2020_ECCV_InterHand2.6M,
+author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
+title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image},
+booktitle = {European Conference on Computer Vision (ECCV)},
+year = {2020}
+}
+```
+
+## Results and models
+
+### 3D Hand Pose Estimation
+
+#### Results on InterHand2.6M val & test set
+
+|Train Set| Set | Arch  | Input Size | MPJPE-single |  MPJPE-interacting  |  MPJPE-all  | MRRPE | APh   | ckpt    | log     |
+| :--- | :--- | :--------: | :--------: | :------: | :------: | :------: |:------: |:------: |:------: |:------: |
+| All | test(H+M) | [InterNet_resnet_50](/configs/hand3d/InterNet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 10.16 | 15.27 | 12.97 | 33.14 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3d_all_256x256-b9c1cf4c_20210506.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3d_all_256x256_20210506.log.json) |
+| All | val(M) | [InterNet_resnet_50](/configs/hand3d/InterNet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 12.03 | 17.88 | 14.84 | 34.93 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3d_all_256x256-b9c1cf4c_20210506.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3d_all_256x256_20210506.log.json) |
@@ -0,0 +1,180 @@
+# Runtime settings: logging level, optional checkpoint to load / resume from
+# (both unset here), distributed backend and workflow.
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+# Checkpointing and evaluation both use an interval of 1.
+checkpoint_config = dict(interval=1)
+# Three metrics are requested; 'MPJPE_all' is named as the key indicator.
+evaluation = dict(
+    interval=1,
+    metric=['MRRPE', 'MPJPE', 'Handedness_acc'],
+    key_indicator='MPJPE_all')
+
+# Adam optimizer with lr=2e-4 and no gradient clipping.
+optimizer = dict(
+    type='Adam',
+    lr=2e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# Step schedule with milestones [15, 17]; total_epochs is 20.
+lr_config = dict(policy='step', step=[15, 17])
+total_epochs = 20
+# Text logging with an interval of 20; the Tensorboard hook is left disabled.
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+
+# Channel layout: every entry below covers the same 42 indices (0-41).
+# NOTE(review): presumably 21 keypoints per hand x 2 hands — confirm.
+channel_cfg = dict(
+    num_output_channels=42,
+    dataset_joints=42,
+    dataset_channel=[list(range(42))],
+    inference_channel=list(range(42)))
+
+# model settings
+# A depth-50 'ResNet' backbone with 'torchvision://resnet50' pretrained
+# weights, followed by an 'Interhand3DHead' configured with three sub-heads
+# (keypoint, root, hand type) and one loss per sub-head.
+model = dict(
+    type='Interhand3D',
+    pretrained='torchvision://resnet50',
+    backbone=dict(type='ResNet', depth=50),
+    keypoint_head=dict(
+        type='Interhand3DHead',
+        keypoint_head_cfg=dict(
+            in_channels=2048,
+            # NOTE(review): 21 * 64 presumably means 21 keypoints x depth_size
+            # bins — confirm against Interhand3DHead.
+            out_channels=21 * 64,
+            depth_size=64,
+            num_deconv_layers=3,
+            num_deconv_filters=(256, 256, 256),
+            num_deconv_kernels=(4, 4, 4),
+        ),
+        root_head_cfg=dict(
+            in_channels=2048,
+            heatmap_size=64,
+            hidden_dims=(512, ),
+        ),
+        hand_type_head_cfg=dict(
+            in_channels=2048,
+            num_labels=2,
+            hidden_dims=(512, ),
+        ),
+        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
+        loss_root_depth=dict(type='L1Loss'),
+        loss_hand_type=dict(type='BCELoss', use_target_weight=True),
+    ),
+    train_cfg={},
+    # Flip testing is disabled.
+    test_cfg=dict(flip_test=False))
+
+# Data settings passed to the train, val and test datasets below; the channel
+# entries are read from channel_cfg so the two stay in sync.
+data_cfg = dict(
+    image_size=[256, 256],
+    heatmap_size=[64, 64, 64],
+    # NOTE(review): depth bounds are presumably in millimetres — confirm.
+    heatmap3d_depth_bound=400.0,
+    heatmap_size_root=64,
+    root_depth_bound=400.0,
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+
+# Training pipeline, applied in list order: image loading, random flip /
+# translation / scale-rotation, affine transform, tensor conversion and
+# normalisation, target generation, and final collection of keys.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='HandRandomFlip', flip_prob=0.5),
+    dict(type='TopDownRandomTranslation', trans_factor=0.15),
+    dict(
+        type='TopDownGetRandomScaleRotation',
+        rot_factor=45,
+        scale_factor=0.25,
+        rot_prob=0.6),
+    # dict(type='MeshRandomChannelNoise', noise_factor=0.2),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    # Three sub-pipelines are listed (3D heatmap target, relative depth
+    # target, and hand-type keys renamed to target / target_weight) and
+    # referenced by pipeline_indices=[0, 1, 2].
+    # NOTE(review): presumably one target per head output, in head order —
+    # confirm against MultitaskGatherTarget.
+    dict(
+        type='MultitaskGatherTarget',
+        pipeline_list=[
+            [dict(
+                type='Generate3DHeatmapTarget',
+                sigma=2.5,
+                max_bound=255,
+            )], [dict(type='HandGenerateRelDepthTarget')],
+            [
+                dict(
+                    type='RenameKeys',
+                    key_pairs=[('hand_type', 'target'),
+                               ('hand_type_valid', 'target_weight')])
+            ]
+        ],
+        pipeline_indices=[0, 1, 2],
+    ),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'flip_pairs'
+        ]),
+]
+
+# Validation pipeline: no random augmentation steps and no target generation;
+# only the image is collected, together with the meta keys listed below
+# (including both depth bounds).
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[
+            'image_file', 'center', 'scale', 'rotation', 'flip_pairs',
+            'heatmap3d_depth_bound', 'root_depth_bound'
+        ]),
+]
+
+# test_pipeline is bound to the same list object as val_pipeline.
+test_pipeline = val_pipeline
+
+# Dataset settings. All three splits use 'InterHand3DDataset' with the shared
+# data_cfg, ground-truth root depth, and no rootnet result file.
+# train and test read the 'all' annotations; val reads 'machine_annot'.
+data_root = 'data/interhand2.6m'
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=2,
+    train=dict(
+        type='InterHand3DDataset',
+        ann_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_train_data.json',
+        camera_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_train_camera.json',
+        joint_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_train_joint_3d.json',
+        img_prefix=f'{data_root}/images/train/',
+        data_cfg=data_cfg,
+        use_gt_root_depth=True,
+        rootnet_result_file=None,
+        pipeline=train_pipeline),
+    val=dict(
+        type='InterHand3DDataset',
+        ann_file=f'{data_root}/annotations/machine_annot/'
+        'InterHand2.6M_val_data.json',
+        camera_file=f'{data_root}/annotations/machine_annot/'
+        'InterHand2.6M_val_camera.json',
+        joint_file=f'{data_root}/annotations/machine_annot/'
+        'InterHand2.6M_val_joint_3d.json',
+        img_prefix=f'{data_root}/images/val/',
+        data_cfg=data_cfg,
+        use_gt_root_depth=True,
+        rootnet_result_file=None,
+        pipeline=val_pipeline),
+    test=dict(
+        type='InterHand3DDataset',
+        ann_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_test_data.json',
+        camera_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_test_camera.json',
+        joint_file=f'{data_root}/annotations/all/'
+        'InterHand2.6M_test_joint_3d.json',
+        img_prefix=f'{data_root}/images/test/',
+        data_cfg=data_cfg,
+        use_gt_root_depth=True,
+        rootnet_result_file=None,
+        # Use the dedicated test_pipeline name (currently an alias of
+        # val_pipeline) so that overriding test_pipeline takes effect.
+        pipeline=test_pipeline),
+)
@@ -4,13 +4,16 @@
 from .mesh_eval import compute_similarity_transform
 from .pose3d_eval import keypoint_mpjpe
 from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_pck_accuracy,
-                            keypoints_from_heatmaps, keypoints_from_regression,
+                            keypoints_from_heatmaps, keypoints_from_heatmaps3d,
+                            keypoints_from_regression,
+                            multilabel_classification_accuracy,
                             pose_pck_accuracy, post_dark_udp)
 
 __all__ = [
     'EvalHook', 'DistEvalHook', 'pose_pck_accuracy', 'keypoints_from_heatmaps',
     'keypoints_from_regression', 'keypoint_pck_accuracy', 'keypoint_auc',
     'keypoint_epe', 'get_group_preds', 'get_multi_stage_outputs',
     'aggregate_results', 'compute_similarity_transform', 'post_dark_udp',
-    'keypoint_mpjpe'
+    'keypoint_mpjpe', 'keypoints_from_heatmaps3d',
+    'multilabel_classification_accuracy'
 ]
@@ -91,6 +91,43 @@ def _get_max_preds(heatmaps):
     return preds, maxvals
 
 
+def _get_max_preds_3d(heatmaps):
+    """Locate the peak of every 3D score map and report its score.
+
+    Note:
+        batch size: N
+        num keypoints: K
+        heatmap depth size: D
+        heatmap height: H
+        heatmap width: W
+
+    Args:
+        heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps.
+
+    Returns:
+        tuple: A tuple containing aggregated results.
+        - preds (np.ndarray[N, K, 3]): Peak (x, y, z) index, -1 if score <= 0.
+        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
+    """
+    assert isinstance(heatmaps, np.ndarray), \
+        ('heatmaps should be numpy.ndarray')
+    assert heatmaps.ndim == 5, 'heatmaps should be 5-ndim'
+
+    N, K, D, H, W = heatmaps.shape
+    flat = heatmaps.reshape((N, K, -1))
+    flat_idx = np.argmax(flat, axis=2)
+    maxvals = np.amax(flat, axis=2).reshape((N, K, 1))
+
+    # Unravel the flat index over the (D, H, W) volume; stored as (x, y, z).
+    x = flat_idx % W
+    y = (flat_idx // W) % H
+    z = flat_idx // (H * W)
+    preds = np.stack((x, y, z), axis=-1).astype(np.float32)
+
+    preds = np.where(maxvals > 0.0, preds, -1)
+    return preds, maxvals
+
+
 def pose_pck_accuracy(output, target, mask, thr=0.05, normalize=None):
     """Calculate the pose accuracy of PCK for each individual keypoint and the
     averaged accuracy across all keypoints from heatmaps.
@@ -574,3 +611,64 @@ def keypoints_from_heatmaps(heatmaps,
         maxvals = maxvals / 255.0 + 0.5
 
     return preds, maxvals
+
+
+def keypoints_from_heatmaps3d(heatmaps, center, scale):
+    """Decode keypoints from 3d heatmaps and map their x/y back to the image.
+
+    Note:
+        batch size: N
+        num keypoints: K
+        heatmap depth size: D
+        heatmap height: H
+        heatmap width: W
+
+    Args:
+        heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps.
+        center (np.ndarray[N, 2]): Center of the bounding box (x, y).
+        scale (np.ndarray[N, 2]): Scale of the bounding box
+            wrt height/width.
+
+    Returns:
+        tuple: A tuple containing keypoint predictions and scores.
+
+        - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location
+        in images.
+        - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
+    """
+    num_samples, _, _, height, width = heatmaps.shape
+    preds, maxvals = _get_max_preds_3d(heatmaps)
+    # Only x and y go through transform_preds; the depth index is untouched.
+    for sample_idx in range(num_samples):
+        preds[sample_idx, :, :2] = transform_preds(
+            preds[sample_idx, :, :2], center[sample_idx], scale[sample_idx],
+            [width, height])
+    return preds, maxvals
+
+
+def multilabel_classification_accuracy(pred, gt, mask, thr=0.5):
+    """Get multi-label classification accuracy.
+
+    Note:
+        batch size: N
+        label number: L
+
+    Args:
+        pred (np.ndarray[N, L, 2]): model predicted labels.
+        gt (np.ndarray[N, L, 2]): ground-truth labels.
+        mask (np.ndarray[N, 1] or np.ndarray[N, L] ): reliability of
+            ground-truth labels.
+        thr (float): Decision threshold. A label counts as correct when the
+            prediction and the ground truth lie strictly on the same side
+            of ``thr``. Default: 0.5.
+
+    Returns:
+        float: multi-label classification accuracy; 0.0 when no sample has
+        valid ground-truth for all labels.
+    """
+    # we only compute accuracy on the samples with ground-truth of all labels.
+    valid = (mask > 0).min(axis=1) if mask.ndim == 2 else (mask > 0)
+    pred, gt = pred[valid], gt[valid]
+
+    if pred.shape[0] == 0:
+        # No sample has gt labels: report 0.0 so the return type stays float.
+        return 0.0
+
+    # The classification of a sample is regarded as correct
+    # only if it's correct for all labels.
+    return float((((pred - thr) * (gt - thr)) > 0).all(axis=1).mean())
@@ -188,11 +188,11 @@ def _pixel2cam(pixel_coord, f, c):
     @staticmethod
     def _encode_handtype(hand_type):
         if hand_type == 'right':
-            return np.array([1, 0], dtype=int)
+            return np.array([1, 0], dtype=np.float32)
         elif hand_type == 'left':
-            return np.array([0, 1], dtype=int)
+            return np.array([0, 1], dtype=np.float32)
         elif hand_type == 'interacting':
-            return np.array([1, 1], dtype=int)
+            return np.array([1, 1], dtype=np.float32)
         else:
             assert 0, f'Not support hand type: {hand_type}'
 
@@ -375,7 +375,7 @@ def evaluate(self, outputs, res_folder, metric='MPJPE', **kwargs):
                 }
 
                 if preds is not None:
-                    kpt['keypoints'] = preds[i].tolist()
+                    kpt['keypoints'] = preds[i, :, :3].tolist()
                 if hand_type is not None:
                     kpt['hand_type'] = hand_type[i][0:2].tolist()
                     kpt['hand_type_score'] = hand_type[i][2:4].tolist()