未验证 提交 a694be1e 编写于 作者: Y Yang Nie 提交者: GitHub

【Hackathon + No.163】基于PaddleDetection PP-TinyPose,新增手势关键点检测模型 (#8066)

* support COCO Whole Body Hand

* update transforms

* disable `AugmentationbyInformantionDropping`

* fix infer bug

* fix getImgIds
上级 eeebef9f
use_gpu: true
log_iter: 5
save_dir: output
snapshot_epoch: 10
weights: output/tinypose_256x256_hand/model_final
epoch: 210
num_joints: &num_joints 21
pixel_std: &pixel_std 200
metric: KeyPointTopDownCOCOWholeBadyHandEval
num_classes: 1
train_height: &train_height 256
train_width: &train_width 256
trainsize: &trainsize [*train_width, *train_height]
hmsize: &hmsize [64, 64]
flip_perm: &flip_perm []
architecture: TopDownHRNet
backbone: LiteHRNet
post_process: HRNetPostProcess
flip_perm: *flip_perm
num_joints: *num_joints
width: &width 40
loss: KeyPointMSELoss
use_dark: true
network_type: wider_naive
freeze_at: -1
freeze_norm: false
return_idx: [0]
use_target_weight: true
loss_scale: 1.0
base_lr: 0.002
- !PiecewiseDecay
milestones: [170, 200]
gamma: 0.1
- !LinearWarmup
start_factor: 0.001
steps: 500
type: Adam
factor: 0.0
type: L2
image_dir: train2017
anno_path: annotations/coco_wholebody_train_v1.0.json
dataset_dir: dataset/coco
num_joints: *num_joints
trainsize: *trainsize
pixel_std: *pixel_std
image_dir: val2017
anno_path: annotations/coco_wholebody_val_v1.0.json
dataset_dir: dataset/coco
num_joints: *num_joints
trainsize: *trainsize
pixel_std: *pixel_std
anno_path: dataset/coco/keypoint_imagelist.txt
worker_num: 2
global_mean: &global_mean [0.485, 0.456, 0.406]
global_std: &global_std [0.229, 0.224, 0.225]
- TopDownRandomShiftBboxCenter:
shift_prob: 0.3
shift_factor: 0.16
- TopDownRandomFlip:
flip_prob: 0.5
flip_perm: *flip_perm
- TopDownGetRandomScaleRotation:
rot_prob: 0.6
rot_factor: 90
scale_factor: 0.3
# - AugmentationbyInformantionDropping:
# prob_cutout: 0.5
# offset_factor: 0.05
# num_patch: 1
# trainsize: *trainsize
- TopDownAffine:
trainsize: *trainsize
use_udp: true
- ToHeatmapsTopDown_DARK:
hmsize: *hmsize
sigma: 2
- NormalizeImage:
mean: *global_mean
std: *global_std
is_scale: true
- Permute: {}
batch_size: 128
shuffle: true
drop_last: false
- TopDownAffine:
trainsize: *trainsize
use_udp: true
- NormalizeImage:
mean: *global_mean
std: *global_std
is_scale: true
- Permute: {}
batch_size: 128
image_shape: [3, *train_height, *train_width]
- Decode: {}
- TopDownEvalAffine:
trainsize: *trainsize
- NormalizeImage:
mean: *global_mean
std: *global_std
is_scale: true
- Permute: {}
batch_size: 1
fuse_normalize: false
......@@ -114,8 +114,10 @@ def get_categories(metric_type, anno_file=None, arch=None):
elif metric_type.lower() == 'widerface':
return _widerface_category()
elif metric_type.lower() == 'keypointtopdowncocoeval' or metric_type.lower(
) == 'keypointtopdownmpiieval':
elif metric_type.lower() in [
'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
return (None, {'id': 'keypoint'})
elif metric_type.lower() == 'pose3deval':
......@@ -635,6 +635,122 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
return kpt_db
class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset):
"""CocoWholeBody dataset for top-down hand pose estimation.
The dataset loads raw features and apply specified transforms
to return a dict containing the image tensors and other information.
COCO-WholeBody Hand keypoint indexes:
0: 'wrist',
1: 'thumb1',
2: 'thumb2',
3: 'thumb3',
4: 'thumb4',
5: 'forefinger1',
6: 'forefinger2',
7: 'forefinger3',
8: 'forefinger4',
9: 'middle_finger1',
10: 'middle_finger2',
11: 'middle_finger3',
12: 'middle_finger4',
13: 'ring_finger1',
14: 'ring_finger2',
15: 'ring_finger3',
16: 'ring_finger4',
17: 'pinky_finger1',
18: 'pinky_finger2',
19: 'pinky_finger3',
20: 'pinky_finger4'
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
pixel_std (int): The pixel std of the scale
Default: 200.
def __init__(self,
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
self.trainsize = trainsize
self.pixel_std = pixel_std
self.dataset_name = 'coco_wholebady_hand'
def _box2cs(self, box):
x, y, w, h = box[:4]
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
if center[0] != -1:
scale = scale * 1.25
return center, scale
def parse_dataset(self):
gt_db = []
num_joints = self.ann_info['num_joints']
coco = COCO(self.get_anno())
img_ids = list(coco.imgs.keys())
for img_id in img_ids:
im_ann = coco.loadImgs(img_id)[0]
image_file = os.path.join(self.img_prefix, im_ann['file_name'])
im_id = int(im_ann["id"])
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = coco.loadAnns(ann_ids)
for obj in objs:
for type in ['left', 'right']:
if (obj[f'{type}hand_valid'] and
max(obj[f'{type}hand_kpts']) > 0):
joints = np.zeros((num_joints, 3), dtype=np.float32)
joints_vis = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj[f'{type}hand_kpts'])
keypoints = keypoints.reshape(-1, 3)
joints[:, :2] = keypoints[:, :2]
joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._box2cs(obj[f'{type}hand_box'][:4])
'image_file': image_file,
'center': center,
'scale': scale,
'gt_joints': joints,
'joints_vis': joints_vis,
'im_id': im_id,
self.db = gt_db
class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
......@@ -38,6 +38,7 @@ registered_ops = []
__all__ = [
'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter', 'TopDownGetRandomScaleRotation',
'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
......@@ -687,6 +688,134 @@ class AugmentationbyInformantionDropping(object):
return records
class TopDownRandomFlip(object):
    """Data augmentation with random horizontal image flip.

    Args:
        flip_perm (list[tuple]): Pairs of keypoint indices which are mirrored
            (for example, left ear and right ear). ``None`` (the default)
            means there are no mirrored pairs.
        flip_prob (float): Probability of flip.
    """

    def __init__(self, flip_perm=None, flip_prob=0.5):
        # ``None`` replaces the former mutable ``[]`` default so instances
        # never share one list object.
        self.flip_perm = [] if flip_perm is None else flip_perm
        self.flip_prob = flip_prob

    def flip_joints(self, joints_3d, joints_3d_visible, img_width, flip_pairs):
        """Mirror keypoints horizontally and swap left/right pairs.

        Args:
            joints_3d (np.ndarray): Keypoint coordinates; column 0 is x.
            joints_3d_visible (np.ndarray): Visibility flags, same length
                as ``joints_3d``.
            img_width (int): Width of the image being flipped (> 0).
            flip_pairs (list[tuple]): ``(left, right)`` index pairs to swap.

        Returns:
            tuple: ``(joints_3d_flipped, joints_3d_visible_flipped)``; the
            inputs are left untouched (copies are modified).
        """
        assert len(joints_3d) == len(joints_3d_visible)
        assert img_width > 0

        joints_3d_flipped = joints_3d.copy()
        joints_3d_visible_flipped = joints_3d_visible.copy()

        # Swap left-right parts
        for left, right in flip_pairs:
            joints_3d_flipped[left, :] = joints_3d[right, :]
            joints_3d_flipped[right, :] = joints_3d[left, :]

            joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
            joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]

        # Flip horizontally
        joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]
        # Zero out the coordinates of joints that are not visible.
        joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0)

        return joints_3d_flipped, joints_3d_visible_flipped

    def __call__(self, results):
        """Perform data augmentation with random image flip.

        Reads ``image``, ``gt_joints``, ``joints_vis`` and ``center`` from
        ``results`` and writes the flipped versions back. ``image`` may be a
        single array or a list of arrays; for a list, the width of the first
        element is used.
        """
        # Flip with probability ``flip_prob``: skip when the draw falls
        # outside it. (The previous ``<=`` test skipped with probability
        # ``flip_prob``, i.e. it flipped with probability 1 - flip_prob.)
        if np.random.rand() >= self.flip_prob:
            return results

        img = results['image']
        joints_3d = results['gt_joints']
        joints_3d_visible = results['joints_vis']
        center = results['center']

        if isinstance(img, list):
            img = [i[:, ::-1, :] for i in img]
            img_width = img[0].shape[1]
        else:
            img = img[:, ::-1, :]
            img_width = img.shape[1]

        joints_3d, joints_3d_visible = self.flip_joints(
            joints_3d, joints_3d_visible, img_width, self.flip_perm)
        # Mirror the box center's x coordinate (modifies ``center`` in place).
        center[0] = img_width - center[0] - 1

        results['image'] = img
        results['gt_joints'] = joints_3d
        results['joints_vis'] = joints_3d_visible
        results['center'] = center

        return results
class TopDownRandomShiftBboxCenter(object):
    """Random shift the bbox center.

    Args:
        shift_factor (float): The factor to control the shift range, which is
            scale*pixel_std*shift_factor. Default: 0.16
        shift_prob (float): Probability of applying random shift. Default: 0.3
        pixel_std (float): Pixel std used to convert ``scale`` back to
            pixels. Default: 200.0 (the value that was previously
            hard-coded).
    """

    def __init__(self, shift_factor=0.16, shift_prob=0.3, pixel_std=200.0):
        self.shift_factor = shift_factor
        self.shift_prob = shift_prob
        self.pixel_std = pixel_std

    def __call__(self, results):
        """Shift ``results['center']`` by a uniform random offset.

        Each coordinate moves by at most
        ``shift_factor * scale * pixel_std``; the shift is applied with
        probability ``shift_prob``.
        """
        center = results['center']
        scale = results['scale']
        if np.random.rand() < self.shift_prob:
            # In-place add, as before: the array held in ``results`` is
            # updated and keeps its dtype.
            center += np.random.uniform(
                -1, 1, 2) * self.shift_factor * scale * self.pixel_std

        results['center'] = center
        return results
class TopDownGetRandomScaleRotation(object):
    """Data augmentation with random scaling & rotating.

    Args:
        rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``.
        scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``.
        rot_prob (float): Probability of random rotation.
    """

    def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6):
        self.rot_factor = rot_factor
        self.scale_factor = scale_factor
        self.rot_prob = rot_prob

    def __call__(self, results):
        """Perform data augmentation with random scaling & rotating.

        Multiplies ``results['scale']`` by a clipped Gaussian factor and
        stores the sampled rotation (0 when no rotation is applied) in
        ``results['rotate']``.
        """
        s = results['scale']

        sf = self.scale_factor
        rf = self.rot_factor

        # Gaussian around 1, clipped to [1 - sf, 1 + sf].
        s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
        s = s * s_factor

        # Gaussian around 0, clipped to [-2 * rf, 2 * rf]; always drawn so
        # the random stream does not depend on whether rotation is applied.
        r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
        r = r_factor if np.random.rand() <= self.rot_prob else 0

        results['scale'] = s
        results['rotate'] = r

        return results
class TopDownAffine(object):
"""apply affine transform to image and coords
......@@ -38,8 +38,8 @@ from ppdet.optimizer import ModelEMA
from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
from ppdet.utils.visualizer import visualize_results, save_result
from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval, Pose3DEval
from ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval
from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
from ppdet.data.source.sniper_coco import SniperCOCODataSet
from ppdet.data.source.category import get_categories
import ppdet.utils.stats as stats
......@@ -348,6 +348,19 @@ class Trainer(object):
elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval':
eval_dataset = self.cfg['EvalDataset']
anno_file = eval_dataset.get_anno()
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
elif self.cfg.metric == 'KeyPointTopDownMPIIEval':
eval_dataset = self.cfg['EvalDataset']
......@@ -19,12 +19,15 @@ import numpy as np
import paddle
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from ..modeling.keypoint_utils import oks_nms
from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe
from scipy.io import loadmat, savemat
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['KeyPointTopDownCOCOEval', 'KeyPointTopDownMPIIEval']
__all__ = [
'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval',
class KeyPointTopDownCOCOEval(object):
......@@ -226,6 +229,164 @@ class KeyPointTopDownCOCOEval(object):
return self.eval_results
class KeyPointTopDownCOCOWholeBadyHandEval(object):
def __init__(self,
super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__()
self.coco = COCO(anno_file)
self.num_samples = num_samples
self.num_joints = num_joints
self.output_eval = output_eval
self.res_file = os.path.join(output_eval, "keypoints_results.json")
self.save_prediction_only = save_prediction_only
def parse_dataset(self):
gt_db = []
num_joints = self.num_joints
coco = self.coco
img_ids = coco.getImgIds()
for img_id in img_ids:
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = coco.loadAnns(ann_ids)
for obj in objs:
for type in ['left', 'right']:
if (obj[f'{type}hand_valid'] and
max(obj[f'{type}hand_kpts']) > 0):
joints = np.zeros((num_joints, 3), dtype=np.float32)
joints_vis = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj[f'{type}hand_kpts'])
keypoints = keypoints.reshape(-1, 3)
joints[:, :2] = keypoints[:, :2]
joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])
'bbox': obj[f'{type}hand_box'],
'gt_joints': joints,
'joints_vis': joints_vis,
self.db = gt_db
def reset(self):
self.results = {
'preds': np.zeros(
(self.num_samples, self.num_joints, 3), dtype=np.float32),
self.eval_results = {}
self.idx = 0
def update(self, inputs, outputs):
kpts, _ = outputs['keypoint'][0]
num_images = inputs['image'].shape[0]
self.results['preds'][self.idx:self.idx + num_images, :, 0:
3] = kpts[:, :, 0:3]
self.idx += num_images
def accumulate(self):
if self.save_prediction_only:
logger.info(f'The keypoint result is saved to {self.res_file} '
'and do not evaluate the mAP.')
self.eval_results = self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE'))
def get_final_results(self, preds):
kpts = []
for idx, kpt in enumerate(preds):
kpts.append({'keypoints': kpt.tolist()})
def _write_keypoint_results(self, keypoints):
if not os.path.exists(self.output_eval):
with open(self.res_file, 'w') as f:
json.dump(keypoints, f, sort_keys=True, indent=4)
logger.info(f'The keypoint result is saved to {self.res_file}.')
except Exception:
content = []
with open(self.res_file, 'r') as f:
for line in f:
content[-1] = ']'
with open(self.res_file, 'w') as f:
for c in content:
def log(self):
if self.save_prediction_only:
for item, value in self.eval_results.items():
print("{} : {}".format(item, value))
def get_results(self):
return self.eval_results
def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30):
"""Keypoint evaluation.
res_file (str): Json file stored prediction results.
metrics (str | list[str]): Metric to be performed.
Options: 'PCK', 'AUC', 'EPE'.
pck_thr (float): PCK threshold, default as 0.2.
auc_nor (float): AUC normalization factor, default as 30 pixel.
List: Evaluation results for evaluation metric.
info_str = []
with open(res_file, 'r') as fin:
preds = json.load(fin)
assert len(preds) == len(self.db)
outputs = []
gts = []
masks = []
threshold_bbox = []
for pred, item in zip(preds, self.db):
outputs.append(np.array(pred['keypoints'])[:, :-1])
gts.append(np.array(item['gt_joints'])[:, :-1])
masks.append((np.array(item['joints_vis'])[:, 0]) > 0)
if 'PCK' in metrics:
bbox = np.array(item['bbox'])
bbox_thr = np.max(bbox[2:])
threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
outputs = np.array(outputs)
gts = np.array(gts)
masks = np.array(masks)
threshold_bbox = np.array(threshold_bbox)
if 'PCK' in metrics:
_, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr,
info_str.append(('PCK', pck))
if 'AUC' in metrics:
info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor)))
if 'EPE' in metrics:
info_str.append(('EPE', keypoint_epe(outputs, gts, masks)))
name_value = OrderedDict(info_str)
return name_value
class KeyPointTopDownMPIIEval(object):
def __init__(self,
......@@ -401,3 +401,151 @@ def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
# Flip horizontally
output_flipped_back = output_flipped_back[..., ::-1]
return output_flipped_back
def _calc_distances(preds, targets, mask, normalize):
"""Calculate the normalized distances between preds and target.
batch_size: N
num_keypoints: K
dimension of keypoints: D (normally, D=2 or D=3)
preds (np.ndarray[N, K, D]): Predicted keypoint location.
targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
normalize (np.ndarray[N, D]): Typical value is heatmap_size
np.ndarray[K, N]: The normalized distances. \
If target keypoints are missing, the distance is -1.
N, K, _ = preds.shape
# set mask=0 when normalize==0
_mask = mask.copy()
_mask[np.where((normalize == 0).sum(1))[0], :] = False
distances = np.full((N, K), -1, dtype=np.float32)
# handle invalid values
normalize[np.where(normalize <= 0)] = 1e6
distances[_mask] = np.linalg.norm(
((preds - targets) / normalize[:, None, :])[_mask], axis=-1)
return distances.T
def _distance_acc(distances, thr=0.5):
"""Return the percentage below the distance threshold, while ignoring
distances values with -1.
batch_size: N
distances (np.ndarray[N, ]): The normalized distances.
thr (float): Threshold of the distances.
float: Percentage of distances below the threshold. \
If all target keypoints are missing, return -1.
distance_valid = distances != -1
num_distance_valid = distance_valid.sum()
if num_distance_valid > 0:
return (distances[distance_valid] < thr).sum() / num_distance_valid
return -1
def keypoint_pck_accuracy(pred, gt, mask, thr, normalize):
    """Calculate the pose accuracy of PCK for each individual keypoint and the
    averaged accuracy across all keypoints for coordinates.

    Note:
        PCK metric measures accuracy of the localization of the body joints.
        The distances between predicted positions and the ground-truth ones
        are typically normalized by the bounding box size.
        The threshold (thr) of the normalized distance is commonly set
        as 0.05, 0.1 or 0.2 etc.

        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        thr (float): Threshold of PCK calculation.
        normalize (np.ndarray[N, 2]): Normalization factor for H&W.

    Returns:
        tuple: A tuple containing keypoint accuracy.

        - acc (np.ndarray[K]): Accuracy of each keypoint.
        - avg_acc (float): Averaged accuracy across all keypoints.
        - cnt (int): Number of valid keypoints.
    """
    # One row of distances per keypoint (shape [K, N]).
    distances = _calc_distances(pred, gt, mask, normalize)

    acc = np.array([_distance_acc(d, thr) for d in distances])
    # Keypoints with no valid target report -1 and are left out of the mean.
    valid_acc = acc[acc >= 0]
    cnt = len(valid_acc)
    avg_acc = valid_acc.mean() if cnt > 0 else 0
    return acc, avg_acc, cnt
def keypoint_auc(pred, gt, mask, normalize, num_step=20):
    """Calculate the area under the PCK curve (AUC) for keypoint coordinates.

    The averaged PCK accuracy is evaluated at ``num_step`` thresholds
    ``0, 1/num_step, ..., (num_step-1)/num_step`` and the results are
    averaged.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize (float): Normalization factor.
        num_step (int): Number of thresholds sampled in [0, 1). Default: 20.

    Returns:
        float: Area under curve.
    """
    # Same scalar normalization for both coordinates of every sample.
    nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1))
    x = [1.0 * i / num_step for i in range(num_step)]
    y = []
    for thr in x:
        _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor)
        # Collect the accuracy at this threshold; without this the
        # accumulation below indexes an empty list.
        y.append(avg_acc)

    auc = 0
    for i in range(num_step):
        auc += 1.0 / num_step * y[i]
    return auc
def keypoint_epe(pred, gt, mask):
    """Calculate the end-point error.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.

    Returns:
        float: Average end-point error.
    """
    # Unit normalization, so distances stay in the inputs' own units.
    normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)
    distances = _calc_distances(pred, gt, mask, normalize)
    # Drop the -1 "target missing" entries before averaging.
    distance_valid = distances[distances != -1]
    # max(1, ...) avoids division by zero when nothing is valid.
    return distance_valid.sum() / max(1, len(distance_valid))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册