diff --git a/configs/keypoint/README.md b/configs/keypoint/README.md index c93932d7360f40c666b766bb7e8b8fc0390a3ea8..8b08f09204321d9b97a0105628f7e87651a9a3a5 100644 --- a/configs/keypoint/README.md +++ b/configs/keypoint/README.md @@ -72,8 +72,11 @@ COCO数据集 | LiteHRNet-18 |Top-Down| 384x288 | 69.7 | [lite_hrnet_18_384x288_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_18_384x288_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_18_384x288_coco.yml) | | LiteHRNet-30 | Top-Down|256x192 | 69.4 | [lite_hrnet_30_256x192_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_30_256x192_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_30_256x192_coco.yml) | | LiteHRNet-30 |Top-Down| 384x288 | 72.5 | [lite_hrnet_30_384x288_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_30_384x288_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_30_384x288_coco.yml) | +|Vitpose_base_simple |Top-Down| 256x192 | 77.7 | [vitpose_base_simple_256x192_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/vitpose_base_simple_256x192_coco.pdparams) | [config](./vit_pose/vitpose_base_simple_coco_256x192.yml) | +|Vitpose_base |Top-Down| 256x192 | 78.2 | [vitpose_base_coco_256x192.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/vitpose_base_coco_256x192.pdparams) | [config](./vit_pose/vitpose_base_coco_256x192.yml) | -备注: Top-Down模型测试AP结果基于GroundTruth标注框 +备注: 1.Top-Down模型测试AP结果基于GroundTruth标注框 + 2.vitpose训练用[MAE](https://bj.bcebos.com/v1/paddledet/models/keypoint/mae_pretrain_vit_base.pdparams)做为预训练模型 MPII数据集 | 模型 | 方案| 输入尺寸 | PCKh(Mean) | PCKh(Mean@0.1) | 模型下载 | 配置文件 | @@ -284,4 +287,12 @@ python deploy/python/det_keypoint_unite_infer.py \ booktitle={CVPR}, year={2021} } + +@inproceedings{ + xu2022vitpose, + title={ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} ``` diff --git a/configs/keypoint/README_en.md b/configs/keypoint/README_en.md index 15f659645c39084924b11cfe8256dd1932ab8ee0..252f31745b4cf76d82a73263ea28a34aaabb9604 100644 --- a/configs/keypoint/README_en.md +++ b/configs/keypoint/README_en.md @@ -75,8 +75,11 @@ COCO Dataset | LiteHRNet-18 | 384x288 | 69.7 | [lite_hrnet_18_384x288_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_18_384x288_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_18_384x288_coco.yml) | | LiteHRNet-30 | 256x192 | 69.4 | [lite_hrnet_30_256x192_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_30_256x192_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_30_256x192_coco.yml) | | LiteHRNet-30 | 384x288 | 72.5 | [lite_hrnet_30_384x288_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/lite_hrnet_30_384x288_coco.pdparams) | [config](./lite_hrnet/lite_hrnet_30_384x288_coco.yml) | +| Vitpose_base_simple | 256x192 | 77.7 | [vitpose_base_simple_256x192_coco.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/vitpose_base_simple_256x192_coco.pdparams) | [config](./vit_pose/vitpose_base_simple_coco_256x192.yml) | +| Vitpose_base | 256x192 | 78.2 | [vitpose_base_coco_256x192.pdparams](https://bj.bcebos.com/v1/paddledet/models/keypoint/vitpose_base_coco_256x192.pdparams) | [config](./vit_pose/vitpose_base_coco_256x192.yml) | -Note:The AP results of Top-Down models are based on bounding boxes in GroundTruth. 
+Note:1.The AP results of Top-Down models are based on bounding boxes in GroundTruth. + 2.Vitpose training uses [MAE](https://bj.bcebos.com/v1/paddledet/models/keypoint/mae_pretrain_vit_base.pdparams) as the pre-training model MPII Dataset | Model | Input Size | PCKh(Mean) | PCKh(Mean@0.1) | Model Download | Config File | @@ -266,4 +269,12 @@ We provide benchmarks in different runtime environments for your reference when booktitle={CVPR}, year={2021} } + +@inproceedings{ + xu2022vitpose, + title={ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} ``` diff --git a/configs/keypoint/vit_pose/vitpose_base_coco_256x196.yml b/configs/keypoint/vit_pose/vitpose_base_coco_256x196.yml new file mode 100644 index 0000000000000000000000000000000000000000..3e4934e3aaa3d9f1fdf451e5b1f5b072c6bf6613 --- /dev/null +++ b/configs/keypoint/vit_pose/vitpose_base_coco_256x196.yml @@ -0,0 +1,171 @@ +use_gpu: true +log_iter: 50 +save_dir: output +snapshot_epoch: 10 +weights: output/vitpose_base_simple_coco_256x192/model_final +epoch: 210 +num_joints: &num_joints 17 +pixel_std: &pixel_std 200 +metric: KeyPointTopDownCOCOEval +num_classes: 1 +train_height: &train_height 256 +train_width: &train_width 192 +trainsize: &trainsize [*train_width, *train_height] +hmsize: &hmsize [48, 64] +flip_perm: &flip_perm [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] + + +#####model +architecture: VitPose_TopDown +pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/keypoint/mae_pretrain_vit_base.pdparams +VitPose_TopDown: + backbone: ViT + head: TopdownHeatmapSimpleHead + post_process: VitPosePostProcess + loss: KeyPointMSELoss + flip_test: True + +ViT: + img_size: [256, 192] + patch_size: 16 + embed_dim: 768 + depth: 12 + num_heads: 12 + ratio: 1 + mlp_ratio: 4 + qkv_bias: True + drop_path_rate: 0.3 + epsilon: 0.000001 + + +TopdownHeatmapSimpleHead: + in_channels: 768 + num_deconv_layers: 2 + num_deconv_filters: [256,256] + num_deconv_kernels: [4,4] + out_channels: 17 + shift_heatmap: False + flip_pairs: *flip_perm + extra: {final_conv_kernel: 1} + +VitPosePostProcess: + use_dark: True + +KeyPointMSELoss: + use_target_weight: true + loss_scale: 1.0 + +####optimizer +LearningRate: + base_lr: 0.0005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [170, 200] + - !LinearWarmup + start_factor: 0.001 + steps: 500 + +OptimizerBuilder: + clip_grad_by_norm: 1.0 + optimizer: + type: AdamWDL + betas: [0.9, 0.999] + weight_decay: 0.1 + num_layers: 12 + layer_decay: 0.75 + filter_bias_and_bn: True + skip_decay_names: ['pos_embed','norm'] + set_param_lr_func: 'layerwise_lr_decay' + + + + +#####data +TrainDataset: + !KeypointTopDownCocoDataset + image_dir: train2017 + anno_path: annotations/person_keypoints_train2017.json + dataset_dir: dataset/coco + num_joints: *num_joints + trainsize: *trainsize + pixel_std: *pixel_std + center_scale: 0.4 + + + + +EvalDataset: + !KeypointTopDownCocoDataset + image_dir: val2017 + anno_path: annotations/person_keypoints_val2017.json + dataset_dir: dataset/coco + num_joints: *num_joints + trainsize: *trainsize + pixel_std: *pixel_std + image_thre: 0.0 + use_gt_bbox: True + +TestDataset: + !ImageFolder + anno_path: dataset/coco/keypoint_imagelist.txt + +worker_num: 4 +global_mean: &global_mean [0.485, 0.456, 0.406] +global_std: &global_std [0.229, 0.224, 0.225] +TrainReader: + sample_transforms: + 
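+    # Per-sample pipeline: random scale/rotation/flip/half-body augmentation,
+    # then a UDP-aligned affine crop to the 192x256 train size, then UDP
+    # Gaussian heatmap targets at 48x64 with sigma 2.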
- RandomFlipHalfBodyTransform: + scale: 0.5 + rot: 40 + num_joints_half_body: 8 + prob_half_body: 0.3 + pixel_std: *pixel_std + trainsize: *trainsize + upper_body_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + flip_pairs: *flip_perm + + - TopDownAffine: + trainsize: *trainsize + use_udp: true + - ToHeatmapsTopDown_UDP: + hmsize: *hmsize + sigma: 2 + + batch_transforms: + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 64 + shuffle: True + drop_last: True + +EvalReader: + sample_transforms: + - TopDownAffine: + trainsize: *trainsize + use_udp: true + batch_transforms: + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 64 + +TestReader: + inputs_def: + image_shape: [3, *train_height, *train_width] + sample_transforms: + - Decode: {} + - TopDownEvalAffine: + trainsize: *trainsize + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 1 + fuse_normalize: false diff --git a/configs/keypoint/vit_pose/vitpose_base_simple_coco_256x192.yml b/configs/keypoint/vit_pose/vitpose_base_simple_coco_256x192.yml new file mode 100644 index 0000000000000000000000000000000000000000..2e34f2593979ffd05bee96c55aba0c369800178b --- /dev/null +++ b/configs/keypoint/vit_pose/vitpose_base_simple_coco_256x192.yml @@ -0,0 +1,164 @@ +use_gpu: true +log_iter: 50 +save_dir: output +snapshot_epoch: 10 +weights: output/vitpose_base_simple_coco_256x192/model_final +epoch: 210 +num_joints: &num_joints 17 +pixel_std: &pixel_std 200 +metric: KeyPointTopDownCOCOEval +num_classes: 1 +train_height: &train_height 256 +train_width: &train_width 192 +trainsize: &trainsize [*train_width, *train_height] +hmsize: &hmsize [48, 64] +flip_perm: &flip_perm [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] + + +#####model +architecture: VitPose_TopDown +pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/keypoint/mae_pretrain_vit_base.pdparams +VitPose_TopDown: + backbone: ViT + head: TopdownHeatmapSimpleHead + post_process: VitPosePostProcess + loss: KeyPointMSELoss + flip_test: True + +ViT: + img_size: [256, 192] + qkv_bias: True + drop_path_rate: 0.3 + epsilon: 0.000001 + + +TopdownHeatmapSimpleHead: + in_channels: 768 + num_deconv_layers: 0 + num_deconv_filters: [] + num_deconv_kernels: [] + upsample: 4 + shift_heatmap: False + flip_pairs: *flip_perm + extra: {final_conv_kernel: 3} + +VitPosePostProcess: + use_dark: True + +KeyPointMSELoss: + use_target_weight: true + loss_scale: 1.0 + +####optimizer +LearningRate: + base_lr: 0.0005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [170, 200] + - !LinearWarmup + start_factor: 0.001 + steps: 500 + +OptimizerBuilder: + clip_grad_by_norm: 1.0 + optimizer: + type: AdamWDL + betas: [0.9, 0.999] + weight_decay: 0.1 + num_layers: 12 + layer_decay: 0.75 + filter_bias_and_bn: True + skip_decay_names: ['pos_embed','norm'] + set_param_lr_func: 'layerwise_lr_decay' + + + + +#####data +TrainDataset: + !KeypointTopDownCocoDataset + image_dir: train2017 + anno_path: annotations/person_keypoints_train2017.json + dataset_dir: dataset/coco + num_joints: *num_joints + trainsize: *trainsize + pixel_std: *pixel_std + center_scale: 0.4 + + + +EvalDataset: + !KeypointTopDownCocoDataset + image_dir: val2017 + anno_path: annotations/person_keypoints_val2017.json + dataset_dir: dataset/coco + num_joints: *num_joints + trainsize: *trainsize + pixel_std: *pixel_std + image_thre: 0.0 + use_gt_bbox: True + +TestDataset: 
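+  # Plain image list for demo inference; no ground-truth boxes or keypoints are needed here.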
+ !ImageFolder + anno_path: dataset/coco/keypoint_imagelist.txt + +worker_num: 4 +global_mean: &global_mean [0.485, 0.456, 0.406] +global_std: &global_std [0.229, 0.224, 0.225] +TrainReader: + sample_transforms: + - RandomFlipHalfBodyTransform: + scale: 0.5 + rot: 40 + num_joints_half_body: 8 + prob_half_body: 0.3 + pixel_std: *pixel_std + trainsize: *trainsize + upper_body_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + flip_pairs: *flip_perm + + - TopDownAffine: + trainsize: *trainsize + use_udp: true + - ToHeatmapsTopDown_UDP: + hmsize: *hmsize + sigma: 2 + + batch_transforms: + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 64 + shuffle: True + drop_last: True + +EvalReader: + sample_transforms: + - TopDownAffine: + trainsize: *trainsize + use_udp: true + batch_transforms: + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 64 + +TestReader: + inputs_def: + image_shape: [3, *train_height, *train_width] + sample_transforms: + - Decode: {} + - TopDownEvalAffine: + trainsize: *trainsize + - NormalizeImage: + mean: *global_mean + std: *global_std + is_scale: true + - Permute: {} + batch_size: 1 + fuse_normalize: false diff --git a/ppdet/data/source/keypoint_coco.py b/ppdet/data/source/keypoint_coco.py index 11ecea538404bf498c66a90f4e8293824edbf317..6e072dc6e8802942728eab31475de1ecb7bda9f3 100644 --- a/ppdet/data/source/keypoint_coco.py +++ b/ppdet/data/source/keypoint_coco.py @@ -491,7 +491,8 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): bbox_file=None, use_gt_bbox=True, pixel_std=200, - image_thre=0.0): + image_thre=0.0, + center_scale=None): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) @@ -500,6 +501,7 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): self.trainsize = trainsize self.pixel_std = pixel_std self.image_thre = image_thre + self.center_scale = center_scale self.dataset_name = 'coco' def parse_dataset(self): @@ -574,6 +576,9 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): center[1] = y + h * 0.5 aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] + if self.center_scale is not None and np.random.rand() < 0.3: + center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h] + if w > aspect_ratio * h: h = w * 1.0 / aspect_ratio elif w < aspect_ratio * h: diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py index 8899e5c0b4cb6957810dcbce20c35f55f0dcbdf2..4c6c5ed0ac1bd5eac22c53228e7376753779a368 100644 --- a/ppdet/modeling/architectures/__init__.py +++ b/ppdet/modeling/architectures/__init__.py @@ -25,6 +25,7 @@ from . import ttfnet from . import s2anet from . import keypoint_hrhrnet from . import keypoint_hrnet +from . import keypoint_vitpose from . import jde from . import deepsort from . import fairmot @@ -55,6 +56,7 @@ from .ttfnet import * from .s2anet import * from .keypoint_hrhrnet import * from .keypoint_hrnet import * +from .keypoint_vitpose import * from .jde import * from .deepsort import * from .fairmot import * diff --git a/ppdet/modeling/architectures/keypoint_vitpose.py b/ppdet/modeling/architectures/keypoint_vitpose.py new file mode 100644 index 0000000000000000000000000000000000000000..b00226a83072c2fe7aaa1a2af92bea8a2c28ce82 --- /dev/null +++ b/ppdet/modeling/architectures/keypoint_vitpose.py @@ -0,0 +1,317 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +import math +import cv2 +from ppdet.core.workspace import register, create, serializable +from .meta_arch import BaseArch +from ..keypoint_utils import transform_preds +from .. import layers as L + +__all__ = ['VitPose_TopDown', 'VitPosePostProcess'] + + +@register +class VitPose_TopDown(BaseArch): + __category__ = 'architecture' + __inject__ = ['loss'] + + def __init__(self, backbone, head, loss, post_process, flip_test): + """ + VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf + + Args: + backbone (nn.Layer): backbone instance + post_process (object): `HRNetPostProcess` instance + + """ + super(VitPose_TopDown, self).__init__() + self.backbone = backbone + self.head = head + self.loss = loss + self.post_process = post_process + self.flip_test = flip_test + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + #head + head = create(cfg['head']) + #post_process + post_process = create(cfg['post_process']) + + return { + 'backbone': backbone, + 'head': head, + 'post_process': post_process + } + + def _forward_train(self): + + feats = self.backbone.forward_features(self.inputs['image']) + vitpost_output = self.head(feats) + return self.loss(vitpost_output, self.inputs) + + def _forward_test(self): + + feats = self.backbone.forward_features(self.inputs['image']) + output_heatmap = self.head(feats) + + if self.flip_test: + img_flipped = self.inputs['image'].flip(3) + features_flipped = self.backbone.forward_features(img_flipped) + output_flipped_heatmap = self.head.inference_model(features_flipped, + self.flip_test) + + output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 + + imshape = (self.inputs['im_shape'].numpy() + )[:, ::-1] if 'im_shape' in self.inputs else None + center = self.inputs['center'].numpy( + ) if 'center' in self.inputs else np.round(imshape / 2.) + scale = self.inputs['scale'].numpy( + ) if 'scale' in self.inputs else imshape / 200. 
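+        # 'center'/'scale' follow the COCO top-down convention: scale is the
+        # box size divided by pixel_std (200), and the post-process multiplies
+        # it back when mapping heatmap coordinates to image coordinates.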
+ + result = self.post_process(output_heatmap.cpu().numpy(), center, scale) + + return result + + def get_loss(self): + return self._forward_train() + + def get_pred(self): + res_lst = self._forward_test() + outputs = {'keypoint': res_lst} + return outputs + + +@register +@serializable +class VitPosePostProcess(object): + def __init__(self, use_dark=False): + self.use_dark = use_dark + + def get_max_preds(self, heatmaps): + '''get predictions from score maps + + Args: + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + + Returns: + preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords + maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints + ''' + assert isinstance(heatmaps, + np.ndarray), 'heatmaps should be numpy.ndarray' + assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = heatmaps.shape[0] + num_joints = heatmaps.shape[1] + width = heatmaps.shape[3] + heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) // width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + + return preds, maxvals + + def post_datk_udp(self, coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The + Devil is in the Details: Delving into Unbiased Data Processing for Human + Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + + Note: + - batch size: B + - num keypoints: K + - num persons: N + - height of heatmaps: H + - width of heatmaps: W + + B=1 for bottom_up paradigm where all persons share the same heatmap. + B=N for top_down paradigm where each person has its own heatmaps. + + Args: + coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. + batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps + kernel (int): Gaussian kernel size (K) for modulation. + + Returns: + np.ndarray([N, K, 2]): Refined coordinates. 
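+            Coordinates stay in heatmap space; `transform_preds_udp` maps them
+            back to the original image afterwards.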
+ """ + if not isinstance(batch_heatmaps, np.ndarray): + batch_heatmaps = batch_heatmaps.cpu().numpy() + B, K, H, W = batch_heatmaps.shape + N = coords.shape[0] + assert (B == 1 or B == N) + for heatmaps in batch_heatmaps: + for heatmap in heatmaps: + cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) + np.log(batch_heatmaps, batch_heatmaps) + + batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), + (1, 1)), + mode='edge').flatten() + + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) + index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + W + 2] + ix1y1 = batch_heatmaps_pad[index + W + 3] + ix1_y1_ = batch_heatmaps_pad[index - W - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - W] + + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(N, K, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(N, K, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() + return coords + + def transform_preds_udp(self, + coords, + center, + scale, + output_size, + use_udp=True): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. + scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = np.ones_like(coords) + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[ + 0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[ + 1] * 0.5 + + return target_coords + + def get_final_preds(self, heatmaps, center, scale, kernelsize=11): + """the highest heatvalue location with a quarter offset in the + direction from the highest response to the second highest response. 
+ + Args: + heatmaps (numpy.ndarray): The predicted heatmaps + center (numpy.ndarray): The boxes center + scale (numpy.ndarray): The scale factor + + Returns: + preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords + maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints + """ + coords, maxvals = self.get_max_preds(heatmaps) + + N, K, H, W = heatmaps.shape + + if self.use_dark: + coords = self.post_datk_udp(coords, heatmaps, kernelsize) + preds = coords.copy() + # Transform back to the image + for i in range(N): + preds[i] = self.transform_preds_udp(preds[i], center[i], + scale[i], [W, H]) + else: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < W - 1 and 1 < py < H - 1: + diff = np.array([ + hm[py][px + 1] - hm[py][px - 1], + hm[py + 1][px] - hm[py - 1][px] + ]) + coords[n][p] += np.sign(diff) * .25 + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds(coords[i], center[i], scale[i], + [W, H]) + + return preds, maxvals + + def __call__(self, output, center, scale): + preds, maxvals = self.get_final_preds(output, center, scale) + outputs = [[ + np.concatenate( + (preds, maxvals), axis=-1), np.mean( + maxvals, axis=1) + ]] + return outputs \ No newline at end of file diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py index f8b183e27bd2a00b2aa578469b997cb70b40cd9b..a20189c948768fcf7edef3e4b420c6fc655263a8 100644 --- a/ppdet/modeling/backbones/__init__.py +++ b/ppdet/modeling/backbones/__init__.py @@ -62,4 +62,5 @@ from .vision_transformer import * from .mobileone import * from .trans_encoder import * from .focalnet import * -from .vit_mae import * +from .vitpose import * +from .vit_mae import * \ No newline at end of file diff --git a/ppdet/modeling/backbones/vitpose.py b/ppdet/modeling/backbones/vitpose.py new file mode 100644 index 0000000000000000000000000000000000000000..23e00be1e761a7a325ecc9ad9dc03189a76306ec --- /dev/null +++ b/ppdet/modeling/backbones/vitpose.py @@ -0,0 +1,320 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from ppdet.core.workspace import register, serializable + +trunc_normal_ = TruncatedNormal(std=.02) + + +def to_2tuple(x): + if isinstance(x, (list, tuple)): + return x + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + + N, C = x.shape[1:] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) * (ratio**2) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=(patch_size[0] // ratio), + padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1))) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + + x = self.proj(x) + return x + + +@register +@serializable +class ViT(nn.Layer): + """ Vision Transformer with support for patch input + + This module is different from ppdet's VisionTransformer (from ppdet/modeling/backbones/visio_transformer.py), + the main differences are: + 1.the module PatchEmbed.proj has padding set,padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1), + VisionTransformer dose not + 2.Attention module qkv is standard.but VisionTransformer provide more options + 3.MLP module only one Dropout,and VisionTransformer twice; + 4.VisionTransformer provide fpn layer,but the module does not. 
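+    5.no cls_token is kept; forward_features adds both the patch position
+    embeddings (pos_embed[:, 1:]) and the cls-token position embedding
+    (pos_embed[:, :1]) to the patch tokens before the transformer blocks.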
+ + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + ratio=1, + pretrained=None, + **kwargs): + super().__init__() + + self.pretrained = pretrained + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ratio=ratio) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=trunc_normal_) + self.add_parameter("pos_embed", self.pos_embed) + + dpr = np.linspace(0, drop_path_rate, depth, dtype='float32') + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + trunc_normal_(self.pos_embed) + self._init_weights() + + def _init_weights(self): + pretrained = self.pretrained + + if pretrained: + + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + + load_state_dict = paddle.load(path) + self.set_state_dict(load_state_dict) + print("Load load_state_dict:", path) + + def forward_features(self, x): + + B = paddle.shape(x)[0] + x = self.patch_embed(x) + B, D, Hp, Wp = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + x = blk(x) + + x = self.last_norm(x) + xp = paddle.reshape( + paddle.transpose( + x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp]) + + return xp diff --git a/ppdet/modeling/heads/__init__.py b/ppdet/modeling/heads/__init__.py index 07df124cd3aeeb2b77910cee115700aed1234632..44a9fa85d19e512eb978bb9c80ab99857fc3b5ca 100644 --- a/ppdet/modeling/heads/__init__.py +++ b/ppdet/modeling/heads/__init__.py @@ -39,6 +39,7 @@ from . import yolof_head from . import ppyoloe_contrast_head from . import centertrack_head from . import sparse_roi_head +from . import vitpose_head from .bbox_head import * from .mask_head import * @@ -68,3 +69,4 @@ from .ppyoloe_contrast_head import * from .centertrack_head import * from .sparse_roi_head import * from .petr_head import * +from .vitpose_head import * \ No newline at end of file diff --git a/ppdet/modeling/heads/vitpose_head.py b/ppdet/modeling/heads/vitpose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..43908ed57b1092fc2f7e84b12688857f7a16158e --- /dev/null +++ b/ppdet/modeling/heads/vitpose_head.py @@ -0,0 +1,278 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
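+
+# Shape note for the 256x192 ViTPose-Base configs: the ViT backbone emits a
+# [N, 768, 16, 12] feature map (patch size 16); this head maps it to
+# [N, 17, 64, 48] heatmaps, either with two stride-2 deconv layers
+# (num_deconv_layers: 2, final 1x1 conv) or, in the simple variant, with a
+# 4x bilinear upsample followed by a 3x3 conv (upsample: 4, final_conv_kernel: 3).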
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ppdet.modeling.keypoint_utils import resize, flip_back +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal(std=0.001) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + +__all__ = ['TopdownHeatmapSimpleHead'] + + +@register +class TopdownHeatmapSimpleHead(nn.Layer): + def __init__(self, + in_channels=768, + out_channels=17, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + upsample=0, + flip_pairs=None, + shift_heatmap=False, + target_type='GaussianHeatmap'): + super(TopdownHeatmapSimpleHead, self).__init__() + + self.in_channels = in_channels + self.upsample = upsample + self.flip_pairs = flip_pairs + self.shift_heatmap = shift_heatmap + self.target_type = target_type + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + nn.Conv2D( + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append(nn.BatchNorm2D(conv_channels)) + layers.append(nn.ReLU()) + + layers.append( + nn.Conv2D( + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=(padding, padding))) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + self.init_weights() + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. 
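+
+        Args:
+            in_channels (int | Sequence[int]): Channels of the input feature map(s).
+            in_index (int | Sequence[int]): Index (or indices) of the backbone
+                outputs to use.
+            input_transform (str | None): 'resize_concat' resizes the selected
+                inputs to a common size and concatenates them; 'multiple_select'
+                returns them as a list; None selects a single feature map.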
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + """ + if not isinstance(inputs, list): + if not isinstance(inputs, list): + + if self.upsample > 0: + inputs = resize( + input=F.relu(inputs), + scale_factor=self.upsample, + mode='bilinear', + align_corners=self.align_corners) + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = paddle.concat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output, self.flip_pairs, target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.shift_heatmap: + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output + return output_heatmap + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + ConvTranspose2d( + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2D(planes)) + layers.append(nn.ReLU()) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + if not isinstance(self.deconv_layers, nn.Identity): + + for m in self.deconv_layers: + if isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + ones_(m.bias) + if not isinstance(self.final_layer, nn.Conv2D): + + for m in self.final_layer: + if isinstance(m, nn.Conv2D): + normal_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + ones_(m.bias) + else: + normal_(self.final_layer.weight) + zeros_(self.final_layer.bias) diff --git 
a/ppdet/modeling/keypoint_utils.py b/ppdet/modeling/keypoint_utils.py
index d5cbeb3ba680b2cf801e541d03b2893d822579d4..377f1d75c94414e4fba92fa314ed7cc1324cc12a 100644
--- a/ppdet/modeling/keypoint_utils.py
+++ b/ppdet/modeling/keypoint_utils.py
@@ -17,6 +17,8 @@ this code is based on https://github.com/open-mmlab/mmpose
 import cv2
 import numpy as np
+import paddle.nn.functional as F
+import warnings
 
 
 def get_affine_mat_kernel(h, w, s, inv=False):
@@ -340,3 +342,63 @@ def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
     keep = keep[:keep_cnt]
 
     return keep
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    if warning:
+        if size is not None and align_corners:
+            input_h, input_w = tuple(int(x) for x in input.shape[2:])
+            output_h, output_w = tuple(int(x) for x in size)
+            if output_h > input_h or output_w > input_w:
+                if ((output_h > 1 and output_w > 1 and input_h > 1 and
+                     input_w > 1) and (output_h - 1) % (input_h - 1) and
+                    (output_w - 1) % (input_w - 1)):
+                    warnings.warn(
+                        f'When align_corners={align_corners}, '
+                        'the output would be more aligned if '
+                        f'input size {(input_h, input_w)} is `x+1` and '
+                        f'out size {(output_h, output_w)} is `nx+1`')
+
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
+    """Flip the flipped heatmaps back to the original form.
+    Note:
+        - batch_size: N
+        - num_keypoints: K
+        - heatmap height: H
+        - heatmap width: W
+    Args:
+        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
+            from the flipped images.
+        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
+            (for example, left ear -- right ear).
+        target_type (str): GaussianHeatmap or CombinedTarget
+    Returns:
+        np.ndarray: heatmaps that flipped back to the original image
+    """
+    assert len(output_flipped.shape) == 4, \
+        'output_flipped should be [batch_size, num_keypoints, height, width]'
+    shape_ori = output_flipped.shape
+    channels = 1
+    if target_type.lower() == 'CombinedTarget'.lower():
+        channels = 3
+        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
+    output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,
+                                             shape_ori[2], shape_ori[3]))
+    output_flipped_back = output_flipped.clone()
+
+    # Swap left-right parts
+    for left, right in flip_pairs:
+        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
+        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
+    output_flipped_back = output_flipped_back.reshape(shape_ori)
+    # Flip horizontally
+    output_flipped_back = output_flipped_back[..., ::-1]
+    return output_flipped_back
diff --git a/ppdet/optimizer/adamw.py b/ppdet/optimizer/adamw.py
index 6ecf676d63224c9e76df25893c83cf5b610e957a..12ab619a33623c4f2c30f539e2e1d7f30acf8c24 100644
--- a/ppdet/optimizer/adamw.py
+++ b/ppdet/optimizer/adamw.py
@@ -50,7 +50,7 @@ def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
             layer = int(static_name[idx:].split('.')[1])
             ratio = decay_rate**(n_layers - layer)
-        elif 'cls_token' in static_name or 'patch_embed' in static_name:
+        elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name:
             ratio = decay_rate**(n_layers + 1)
 
     if IS_PADDLE_LATER_2_4:
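
The `adamw.py` change above adds `pos_embed` to the embedding-level parameters that receive the smallest layer-wise learning-rate ratio, alongside `cls_token` and `patch_embed`. Below is a minimal sketch of how that layer-wise decay assigns per-parameter ratios under the settings used by these configs (`num_layers: 12`, `layer_decay: 0.75`); the parameter names are illustrative, not ppdet's exact static names:

```python
def layerwise_ratio(name, n_layers=12, decay_rate=0.75):
    # Embedding-level parameters sit "below" block 0 and get the smallest ratio.
    if any(k in name for k in ('cls_token', 'patch_embed', 'pos_embed')):
        return decay_rate ** (n_layers + 1)
    # Transformer blocks: shallow blocks are scaled down more than deep ones.
    if 'blocks.' in name:
        layer = int(name.split('blocks.')[1].split('.')[0])
        return decay_rate ** (n_layers - layer)
    # Everything else (head, final norm) keeps the full base learning rate.
    return 1.0


for name in ('pos_embed', 'blocks.0.attn.qkv.weight',
             'blocks.11.mlp.fc2.weight', 'final_layer.weight'):
    print(name, round(layerwise_ratio(name), 4))
# pos_embed 0.0238, blocks.0 0.0317, blocks.11 0.75, final_layer 1.0
```

The practical effect is that the MAE-pretrained embeddings and early blocks move much more slowly than the randomly initialized heatmap head.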