From 0bf1c25c8e9dc824a7c4c06275f84f718c47e06b Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Wed, 8 Feb 2023 16:53:46 +0800 Subject: [PATCH] add metro3d config file (#7703) * best mpjpe 55 * rename configfile * replace api --- configs/pose3d/metro3d_24kpts.yml | 144 +++++++++++++++++++ ppdet/data/source/pose3d_cmb.py | 13 +- ppdet/metrics/pose3d_metrics.py | 7 +- ppdet/modeling/architectures/pose3d_metro.py | 19 +-- ppdet/modeling/losses/pose3d_loss.py | 56 ++++++-- ppdet/utils/visualizer.py | 20 +-- 6 files changed, 219 insertions(+), 40 deletions(-) create mode 100644 configs/pose3d/metro3d_24kpts.yml diff --git a/configs/pose3d/metro3d_24kpts.yml b/configs/pose3d/metro3d_24kpts.yml new file mode 100644 index 000000000..b8ea08a23 --- /dev/null +++ b/configs/pose3d/metro3d_24kpts.yml @@ -0,0 +1,144 @@ +use_gpu: True +log_iter: 20 +save_dir: output +snapshot_epoch: 3 +weights: output/metro_modified/model_final +epoch: 50 +metric: Pose3DEval +num_classes: 1 +train_height: &train_height 224 +train_width: &train_width 224 +trainsize: &trainsize [*train_width, *train_height] +num_joints: &num_joints 24 + +#####model +architecture: METRO_Body +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/Trunc_HRNet_W32_C_pretrained.pdparams + +METRO_Body: + backbone: HRNet + trans_encoder: TransEncoder + num_joints: *num_joints + loss: Pose3DLoss + +HRNet: + width: 32 + freeze_at: -1 + freeze_norm: False + norm_momentum: 0.1 + downsample: True + +TransEncoder: + vocab_size: 30522 + num_hidden_layers: 4 + num_attention_heads: 4 + position_embeddings_size: 512 + intermediate_size: 3072 + input_feat_dim: [2048, 512, 128] + hidden_feat_dim: [1024, 256, 128] + attention_probs_dropout_prob: 0.1 + fc_dropout_prob: 0.1 + act_fn: 'gelu' + output_attentions: False + output_hidden_feats: False + +Pose3DLoss: + weight_3d: 1.0 + weight_2d: 0.0 + +#####optimizer +LearningRate: + base_lr: 0.0001 + schedulers: + - !CosineDecay + max_epochs: 52 + - !LinearWarmup + start_factor: 0.01 + steps: 2000 + + +OptimizerBuilder: + clip_grad_by_norm: 0.2 + optimizer: + type: Adam + regularizer: + factor: 0.0 + type: L2 + + +#####data +TrainDataset: + !Pose3DDataset + dataset_dir: dataset/traindata/ + image_dirs: ["human3.6m", "posetrack3d", "hr-lspet", "hr-lspet", "mpii/images", "coco/train2017"] + anno_list: ["pose3d/Human3.6m_train.json", "pose3d/PoseTrack_ver01.json", "pose3d/LSPet_train_ver10.json", "pose3d/LSPet_test_ver10.json", "pose3d/MPII_ver01.json", "pose3d/COCO2014-All-ver01.json"] + num_joints: *num_joints + test_mode: False + +EvalDataset: + !Pose3DDataset + dataset_dir: dataset/traindata/ + image_dirs: ["human3.6m"] + anno_list: ["pose3d/Human3.6m_valid.json"] + num_joints: *num_joints + test_mode: True + +TestDataset: + !ImageFolder + anno_path: dataset/traindata/coco/keypoint_imagelist.txt + +worker_num: 4 +global_mean: &global_mean [0.485, 0.456, 0.406] +global_std: &global_std [0.229, 0.224, 0.225] +TrainReader: + sample_transforms: + - SinglePoseAffine: + trainsize: *trainsize + rotate: [1.0, 30] #[prob, rotate range] + scale: [1.0, 0.25] #[prob, scale range] + - FlipPose: + flip_prob: 0.5 + img_res: *train_width + num_joints: *num_joints + - NoiseJitter: + noise_factor: 0.4 + batch_transforms: + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 64 + shuffle: true + drop_last: true + +EvalReader: + sample_transforms: + - SinglePoseAffine: + trainsize: *trainsize + rotate: [0., 30] + scale: [0., 
0.25]
  batch_transforms:
  - NormalizeImage:
      mean: *global_mean
      std: *global_std
      is_scale: true
  - Permute: {}
  batch_size: 16
  shuffle: false
  drop_last: false

TestReader:
  inputs_def:
    image_shape: [3, *train_height, *train_width]
  sample_transforms:
  - Decode: {}
  - TopDownEvalAffine:
      trainsize: *trainsize
  - NormalizeImage:
      mean: *global_mean
      std: *global_std
      is_scale: true
  - Permute: {}
  batch_size: 1
  fuse_normalize: false # whether to fuse the normalize layer into the model when exporting it
diff --git a/ppdet/data/source/pose3d_cmb.py b/ppdet/data/source/pose3d_cmb.py
index ab7123aec..ea89daf01 100644
--- a/ppdet/data/source/pose3d_cmb.py
+++ b/ppdet/data/source/pose3d_cmb.py
@@ -77,10 +77,12 @@ class Pose3DDataset(DetDataset):
             indices = np.random.choice(
                 np.arange(num_joints), replace=False, size=masked_num)
             mjm_mask[indices, :] = 0.0
+        # return mjm_mask
 
-        mvm_mask = np.ones((10, 1)).astype(np.float32)
+        num_joints = 1
+        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
         if self.test_mode == False:
-            num_vertices = 10
+            num_vertices = num_joints
             pb = np.random.random_sample()
             masked_num = int(
                 pb * mvm_percent *
@@ -108,6 +110,7 @@ class Pose3DDataset(DetDataset):
         print("Loading annotations..., please wait")
         self.annos = []
         im_id = 0
+        self.human36m_num = 0
         for idx, annof in enumerate(self.anno_list):
             img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
             dataf = os.path.join(self.dataset_dir, annof)
@@ -138,6 +141,8 @@
                     print("cannot find imagepath:{}".format(imagename))
                     continue
                 new_anno['imageName'] = imagename
+                if 'human3.6m' in imagename:
+                    self.human36m_num += 1
                 new_anno['bbox_center'] = anno['bbox_center']
                 new_anno['bbox_scale'] = anno['bbox_scale']
                 new_anno['joints_2d'] = np.array(anno[
@@ -160,6 +165,10 @@
                 self.annos.append(new_anno)
             del annos
 
+    def get_temp_num(self):
+        """Get the number of temporal (video-based) samples, e.g. Human3.6M."""
+        return self.human36m_num
+
     def __len__(self):
         """Get dataset length."""
         return len(self.annos)
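To make the masking hunks above easier to review outside diff form, here is a minimal, self-contained NumPy sketch of how the joint mask (mjm) and the now single-entry vertex mask (mvm) are drawn during training. The helper name and the `mjm_percent`/`mvm_percent` values are illustrative assumptions; only the drawing logic mirrors the patch.

```python
import numpy as np

def make_meta_masks(num_joints=24, num_vertices=1,
                    mjm_percent=0.3, mvm_percent=0.3, test_mode=False):
    # Hypothetical helper mirroring Pose3DDataset's mask drawing;
    # percent values are assumptions, not taken from the repository.
    mjm_mask = np.ones((num_joints, 1), dtype=np.float32)
    mvm_mask = np.ones((num_vertices, 1), dtype=np.float32)  # was (10, 1) before this patch
    if not test_mode:
        # zero out a random subset of joint entries
        pb = np.random.random_sample()
        masked = int(pb * mjm_percent * num_joints)
        idx = np.random.choice(np.arange(num_joints), replace=False, size=masked)
        mjm_mask[idx, :] = 0.0
        # same scheme for the (now single) vertex entry
        pb = np.random.random_sample()
        masked = int(pb * mvm_percent * num_vertices)
        idx = np.random.choice(np.arange(num_vertices), replace=False, size=masked)
        mvm_mask[idx, :] = 0.0
    return mjm_mask, mvm_mask
```

Shrinking the vertex mask to a single entry matches the architecture change below, where the ten vertex tokens are replaced by one extra camera token.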
diff --git a/ppdet/metrics/pose3d_metrics.py b/ppdet/metrics/pose3d_metrics.py
index 45b9239a5..32e1deb61 100644
--- a/ppdet/metrics/pose3d_metrics.py
+++ b/ppdet/metrics/pose3d_metrics.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+from paddle.distributed import ParallelEnv
 import os
 import json
 from collections import defaultdict, OrderedDict
@@ -161,8 +162,10 @@ class Pose3DEval(object):
         return paddle.index_select(input, J24_TO_J14, axis=1)
 
     def update(self, inputs, outputs):
-        gt_3d_joints = all_gather(inputs['joints_3d'])
-        has_3d_joints = all_gather(inputs['has_3d_joints'])
+        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
+                                                           .local_rank))
+        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
+                                                                .local_rank))
         pred_3d_joints = all_gather(outputs['pose3d'])
         if gt_3d_joints.shape[1] == 24:
             gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
diff --git a/ppdet/modeling/architectures/pose3d_metro.py b/ppdet/modeling/architectures/pose3d_metro.py
index 9e66bd78f..b56280981 100644
--- a/ppdet/modeling/architectures/pose3d_metro.py
+++ b/ppdet/modeling/architectures/pose3d_metro.py
@@ -65,10 +65,8 @@ class METRO_Body(BaseArch):
         self.deploy = False
 
         self.trans_encoder = trans_encoder
-        self.conv_learn_tokens = paddle.nn.Conv1D(49, 10 + num_joints, 1)
-        self.cam_param_fc = paddle.nn.Linear(3, 1)
-        self.cam_param_fc2 = paddle.nn.Linear(10, 250)
-        self.cam_param_fc3 = paddle.nn.Linear(250, 3)
+        self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 1, 1)
+        self.cam_param_fc = paddle.nn.Linear(3, 2)
 
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
@@ -85,7 +83,7 @@ class METRO_Body(BaseArch):
         image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))
         image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))
         # and apply a conv layer to learn image token for each 3d joint/vertex position
-        features = self.conv_learn_tokens(image_feat_flatten)
+        features = self.conv_learn_tokens(image_feat_flatten)  # (B, num_joints + 1, C)
 
         if self.training:
             # apply mask vertex/joint modeling
@@ -95,20 +93,13 @@ class METRO_Body(BaseArch):
             constant_tensor = paddle.ones_like(features) * 0.01
             features = features * meta_masks + constant_tensor * (1 - meta_masks
                                                                   )
 
-        pred_out = self.trans_encoder(features)
+        pred_out = self.trans_encoder(features)
         pred_3d_joints = pred_out[:, :self.num_joints, :]
         cam_features = pred_out[:, self.num_joints:, :]
 
         # learn camera parameters
-        x = self.cam_param_fc(cam_features)
-        x = x.transpose(perm=(0, 2, 1))
-        x = self.cam_param_fc2(x)
-        x = self.cam_param_fc3(x)
-        cam_param = x.transpose(perm=(0, 2, 1))
-        pred_camera = cam_param.squeeze()
-        pred_2d_joints = orthographic_projection(pred_3d_joints, pred_camera)
-
+        pred_2d_joints = self.cam_param_fc(cam_features)
         return pred_3d_joints, pred_2d_joints
 
     def get_loss(self):
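The head change above is easier to see as a shape walk-through. The sketch below assumes the backbone emits a (B, 2048, 7, 7) feature map (consistent with the reshape to 49 positions) and swaps the real TransEncoder for a Linear stand-in that only mimics its 3-channel output; the layer sizes themselves follow the diff.

```python
# Shape walk-through of the new METRO_Body head (stand-alone sketch).
import paddle

B, J = 2, 24
image_feat = paddle.randn([B, 2048, 7, 7])                 # assumed backbone output
feat = image_feat.reshape((B, 2048, 49)).transpose(perm=(0, 2, 1))  # (B, 49, 2048)

conv_learn_tokens = paddle.nn.Conv1D(49, J + 1, 1)         # was Conv1D(49, 10 + J, 1)
tokens = conv_learn_tokens(feat)                           # (B, J + 1, 2048)

fake_encoder = paddle.nn.Linear(2048, 3)                   # stand-in for TransEncoder
pred_out = fake_encoder(tokens)                            # (B, J + 1, 3)

pred_3d_joints = pred_out[:, :J, :]                        # (B, J, 3)
cam_features = pred_out[:, J:, :]                          # (B, 1, 3), one extra token
cam_param_fc = paddle.nn.Linear(3, 2)                      # replaces the 3-layer camera MLP
pred_2d_joints = cam_param_fc(cam_features)                # (B, 1, 2)
print(pred_3d_joints.shape, pred_2d_joints.shape)
```

With `weight_2d: 0.0` in the shipped config, this single-token 2D branch contributes nothing to training, which is consistent with collapsing the former camera MLP and orthographic projection into one small Linear layer.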
diff --git a/ppdet/modeling/losses/pose3d_loss.py b/ppdet/modeling/losses/pose3d_loss.py
index 2b98508f4..4781d6e5c 100644
--- a/ppdet/modeling/losses/pose3d_loss.py
+++ b/ppdet/modeling/losses/pose3d_loss.py
@@ -20,8 +20,11 @@ from itertools import cycle, islice
 from collections import abc
 import paddle
 import paddle.nn as nn
+import paddle.nn.functional as F
 
 from ppdet.core.workspace import register, serializable
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
 
 __all__ = ['Pose3DLoss']
 
@@ -42,7 +45,7 @@ class Pose3DLoss(nn.Layer):
         self.weight_3d = weight_3d
         self.weight_2d = weight_2d
         self.criterion_2dpose = nn.MSELoss(reduction=reduction)
-        self.criterion_3dpose = nn.MSELoss(reduction=reduction)
+        self.criterion_3dpose = nn.L1Loss(reduction=reduction)
         self.criterion_smoothl1 = nn.SmoothL1Loss(
             reduction=reduction, delta=1.0)
         self.criterion_vertices = nn.L1Loss()
@@ -57,10 +60,17 @@
         has_3d_joints = inputs['has_3d_joints']
         has_2d_joints = inputs['has_2d_joints']
 
-        loss_3d = mpjpe(pred3d, gt_3d_joints, has_3d_joints)
-        loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d, gt_2d_joints,
-                                   has_2d_joints)
-        return self.weight_3d * loss_3d + self.weight_2d * loss_2d
+        loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints)
+        loss = self.weight_3d * loss_3d
+        epoch = inputs['epoch_id']
+        if self.weight_2d > 0:
+            weight = self.weight_2d * pow(0.1, (epoch // 8))
+            if epoch > 8:
+                weight = 0
+            loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d,
+                                       gt_2d_joints, has_2d_joints)
+            loss += weight * loss_2d
+        return loss
 
 
 def filter_3d_joints(pred, gt, has_3d_joints):
@@ -78,25 +88,45 @@
     return pred, gt
 
 
-@register
-@serializable
 def mpjpe(pred, gt, has_3d_joints):
     """ 
     mPJPE loss
     """
     pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
-    error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean()
+    error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2
+                         ).sum(axis=-1)).mean()
     return error
+
+
+def mpjpe_focal(pred, gt, has_3d_joints):
+    """
+    Focal-style mPJPE: squared errors reweighted toward high-error joints
+    """
+    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
+    mse_error = ((pred - gt)**2).sum(axis=-1)
+    mpjpe_error = paddle.sqrt(mse_error)
+    mean = mpjpe_error.mean()
+    std = mpjpe_error.std()
+    atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std)
+    mse_error *= atte
+    return mse_error.mean()
+
+
+def mpjpe_mse(pred, gt, has_3d_joints, weight=1.):
+    """
+    Mean squared per-joint position error (no square root)
+    """
+    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
+    error = (((pred - gt)**2).sum(axis=-1)).mean()
+    return error
 
 
-@register
-@serializable
 def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d):
     """ 
     mPJPE loss of self define criterion
     """
     pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
-    error = paddle.sqrt(criterion_pose3d(pred, gt).sum(axis=-1)).mean()
+    error = paddle.sqrt(criterion_pose3d(pred, gt)).mean()
     return error
 
@@ -165,8 +195,8 @@
     The confidence (conf) is binary and indicates whether the keypoints exist or not.
     """
     conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone()
-    loss = (conf * criterion_keypoints(pred_keypoints_2d,
-                                       gt_keypoints_2d[:, :, :-1])).mean()
+    loss = (conf * criterion_keypoints(
+        pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean()
     return loss
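Since `mpjpe_focal` is the piece of this patch that most changes training behaviour, here is a small self-contained illustration of its reweighting: joints whose error sits above the batch mean are up-weighted (attention approaches 2), joints below it are down-weighted (attention approaches 0). The random tensors are stand-ins for predictions and ground truth.

```python
import paddle
import paddle.nn.functional as F

pred = paddle.randn([4, 14, 3])                       # dummy predictions
gt = pred + 0.05 * paddle.randn([4, 14, 3])           # dummy ground truth

mse_error = ((pred - gt) ** 2).sum(axis=-1)           # (B, J) squared distances
mpjpe_error = paddle.sqrt(mse_error)                  # (B, J) per-joint errors
mean, std = mpjpe_error.mean(), mpjpe_error.std()
atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std)  # in (0, 2), equals 1 at the mean
loss = (mse_error * atte).mean()
print(float(loss))
```

Note also that `get_loss` now decays the 2D term by `0.1 ** (epoch // 8)` and disables it entirely after epoch 8; with `weight_2d: 0.0` in the new config, the 2D branch never contributes at all.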
diff --git a/ppdet/utils/visualizer.py b/ppdet/utils/visualizer.py
index 135180854..f7193306c 100644
--- a/ppdet/utils/visualizer.py
+++ b/ppdet/utils/visualizer.py
@@ -50,7 +50,8 @@ def visualize_results(image,
     if keypoint_res is not None:
         image = draw_pose(image, keypoint_res, threshold)
     if pose3d_res is not None:
-        image = draw_pose3d(image, pose3d_res, threshold)
+        pose3d = np.array(pose3d_res[0]['pose3d']) * 1000
+        image = draw_pose3d(image, pose3d, visual_thread=threshold)
     return image
 
@@ -325,12 +326,11 @@ def draw_pose(image,
 
 
 def draw_pose3d(image,
-                results,
+                pose3d,
+                pose2d=None,
                 visual_thread=0.6,
                 save_name='pose3d.jpg',
-                save_dir='output',
-                returnimg=False,
-                ids=None):
+                returnimg=True):
     try:
         import matplotlib.pyplot as plt
         import matplotlib
     except Exception as e:
         logger.error('Matplotlib not found, please install matplotlib.'
                      'for example: `pip install matplotlib`.')
         raise e
 
-    pose3d = np.array(results[0]['pose3d']) * 1000
     if pose3d.shape[0] == 24:
         joints_connectivity_dict = [
             [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],
-            [3, 14, 1], [14, 15, 1], [15, 16, 1], [16, 12, 1], [6, 7, 0],
+            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],
             [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],
             [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],
             [23, 21, 1]
@@ -450,6 +449,9 @@
         image = Image.frombytes("RGBA", (w, h), buf.tostring())
         return image.convert("RGB")
 
-    fig = draw_img_pose(pose3d, frame=image)
+    fig = draw_img_pose(pose3d, pose2d, frame=image)
     data = fig2data(fig)
-    return data
+    if returnimg is False:
+        data.save(save_name)
+    else:
+        return data
-- 
GitLab
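Finally, a hedged usage sketch for the updated `draw_pose3d` contract: callers now pass the pose array directly (already scaled to millimetres, as `visualize_results` does with `* 1000`) and get a PIL image back by default. The dummy pose, the PIL frame, and the file names are illustrative assumptions, not from the repository.

```python
import numpy as np
from PIL import Image
from ppdet.utils.visualizer import draw_pose3d

frame = Image.new("RGB", (224, 224))           # stand-in for a real input frame
pose3d = np.random.randn(24, 3) * 100.0        # dummy 24-joint pose, mm scale

vis = draw_pose3d(frame, pose3d, visual_thread=0.6)   # returnimg defaults to True
vis.save("pose3d_vis.jpg")

# With returnimg=False the rendered figure is written to save_name instead:
draw_pose3d(frame, pose3d, save_name="pose3d.jpg", returnimg=False)
```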