add hrnet data, engine, metrics; rename Hrnet to HRNet (#2890)

a06a6258 · zhiboniu · GitHub · 90bfe009 · a06a6258 · a06a6258
8 changed file
--- a/ppdet/data/source/keypoint_coco.py
+++ b/ppdet/data/source/keypoint_coco.py
@@ -15,8 +15,8 @@
 import os
 import cv2
 import numpy as np
+import json
 import copy
-# TODO: unify xtococotools and pycocotools
 import pycocotools
 from pycocotools.coco import COCO
 from .dataset import DetDataset
@@ -317,4 +317,341 @@ class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset):
        self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)

        self.dataset_name = 'crowdpose'
-        print(f'=> num_images: {self.num_images}')
+        print('=> num_images: {}'.format(self.num_images))
+
+
+@serializable
+class KeypointTopDownBaseDataset(DetDataset):
+    """Base class for top_down datasets.
+
+    All datasets should subclass it.
+    All subclasses should overwrite:
+        Methods:`_get_db`
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): keypoint numbers
+        transform (composed(operators)): A sequence of data transforms.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[]):
+        super().__init__(dataset_dir, image_dir, anno_path)
+        self.image_info = {}
+        self.ann_info = {}
+
+        self.img_prefix = os.path.join(dataset_dir, image_dir)
+        self.transform = transform
+
+        self.ann_info['num_joints'] = num_joints
+        self.db = []
+
+    def __len__(self):
+        """Get dataset length."""
+        return len(self.db)
+
+    def _get_db(self):
+        """Get a sample"""
+        raise NotImplementedError
+
+    def __getitem__(self, idx):
+        """Prepare sample for training given the index."""
+        records = copy.deepcopy(self.db[idx])
+        records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR |
+                                      cv2.IMREAD_IGNORE_ORIENTATION)
+        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
+        records['score'] = records['score'] if 'score' in records else 1
+        records = self.transform(records)
+        # print('records', records)
+        return records
+
+
+@register
+@serializable
+class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
+    """COCO dataset for top-down pose estimation.
+
+    The dataset loads raw features and apply specified transforms
+    to return a dict containing the image tensors and other information.
+
+    COCO keypoint indexes:
+
+        0: 'nose',
+        1: 'left_eye',
+        2: 'right_eye',
+        3: 'left_ear',
+        4: 'right_ear',
+        5: 'left_shoulder',
+        6: 'right_shoulder',
+        7: 'left_elbow',
+        8: 'right_elbow',
+        9: 'left_wrist',
+        10: 'right_wrist',
+        11: 'left_hip',
+        12: 'right_hip',
+        13: 'left_knee',
+        14: 'right_knee',
+        15: 'left_ankle',
+        16: 'right_ankle'
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): Keypoint numbers
+        trainsize (list):[w, h] Image target size
+        transform (composed(operators)): A sequence of data transforms.
+        bbox_file (str): Path to a detection bbox file
+            Default: None.
+        use_gt_bbox (bool): Whether to use ground truth bbox
+            Default: True.
+        pixel_std (int): The pixel std of the scale
+            Default: 200.
+        image_thre (float): The threshold to filter the detection box
+            Default: 0.0.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 trainsize,
+                 transform=[],
+                 bbox_file=None,
+                 use_gt_bbox=True,
+                 pixel_std=200,
+                 image_thre=0.0):
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform)
+
+        self.bbox_file = bbox_file
+        self.use_gt_bbox = use_gt_bbox
+        self.trainsize = trainsize
+        self.pixel_std = pixel_std
+        self.image_thre = image_thre
+        self.dataset_name = 'coco'
+
+    def parse_dataset(self):
+        if self.use_gt_bbox:
+            self.db = self._load_coco_keypoint_annotations()
+        else:
+            self.db = self._load_coco_person_detection_results()
+
+    def _load_coco_keypoint_annotations(self):
+        coco = COCO(self.get_anno())
+        img_ids = coco.getImgIds()
+        gt_db = []
+        for index in img_ids:
+            im_ann = coco.loadImgs(index)[0]
+            width = im_ann['width']
+            height = im_ann['height']
+            file_name = im_ann['file_name']
+            im_id = int(im_ann["id"])
+
+            annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
+            objs = coco.loadAnns(annIds)
+
+            valid_objs = []
+            for obj in objs:
+                x, y, w, h = obj['bbox']
+                x1 = np.max((0, x))
+                y1 = np.max((0, y))
+                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
+                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
+                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
+                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
+                    valid_objs.append(obj)
+            objs = valid_objs
+
+            rec = []
+            for obj in objs:
+                if max(obj['keypoints']) == 0:
+                    continue
+
+                joints = np.zeros(
+                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                joints_vis = np.zeros(
+                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                for ipt in range(self.ann_info['num_joints']):
+                    joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
+                    joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
+                    joints[ipt, 2] = 0
+                    t_vis = obj['keypoints'][ipt * 3 + 2]
+                    if t_vis > 1:
+                        t_vis = 1
+                    joints_vis[ipt, 0] = t_vis
+                    joints_vis[ipt, 1] = t_vis
+                    joints_vis[ipt, 2] = 0
+
+                center, scale = self._box2cs(obj['clean_bbox'][:4])
+                rec.append({
+                    'image_file': os.path.join(self.img_prefix, file_name),
+                    'center': center,
+                    'scale': scale,
+                    'joints': joints,
+                    'joints_vis': joints_vis,
+                    'im_id': im_id,
+                })
+            gt_db.extend(rec)
+
+        return gt_db
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
+
+        if w > aspect_ratio * h:
+            h = w * 1.0 / aspect_ratio
+        elif w < aspect_ratio * h:
+            w = h * aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        if center[0] != -1:
+            scale = scale * 1.25
+
+        return center, scale
+
+    def _load_coco_person_detection_results(self):
+        all_boxes = None
+        bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file)
+        with open(bbox_file_path, 'r') as f:
+            all_boxes = json.load(f)
+
+        if not all_boxes:
+            print('=> Load %s fail!' % bbox_file_path)
+            return None
+
+        kpt_db = []
+        for n_img in range(0, len(all_boxes)):
+            det_res = all_boxes[n_img]
+            if det_res['category_id'] != 1:
+                continue
+            file_name = det_res[
+                'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[
+                    'image_id']
+            img_name = os.path.join(self.img_prefix, file_name)
+            box = det_res['bbox']
+            score = det_res['score']
+            im_id = int(det_res['image_id'])
+
+            if score < self.image_thre:
+                continue
+
+            center, scale = self._box2cs(box)
+            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints_vis = np.ones(
+                (self.ann_info['num_joints'], 3), dtype=np.float)
+            kpt_db.append({
+                'image_file': img_name,
+                'im_id': im_id,
+                'center': center,
+                'scale': scale,
+                'score': score,
+                'joints': joints,
+                'joints_vis': joints_vis,
+            })
+
+        return kpt_db
+
+
+@register
+@serializable
+class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
+    """MPII dataset for topdown pose estimation.
+
+    The dataset loads raw features and apply specified transforms
+    to return a dict containing the image tensors and other information.
+
+    MPII keypoint indexes::
+
+        0: 'right_ankle',
+        1: 'right_knee',
+        2: 'right_hip',
+        3: 'left_hip',
+        4: 'left_knee',
+        5: 'left_ankle',
+        6: 'pelvis',
+        7: 'thorax',
+        8: 'upper_neck',
+        9: 'head_top',
+        10: 'right_wrist',
+        11: 'right_elbow',
+        12: 'right_shoulder',
+        13: 'left_shoulder',
+        14: 'left_elbow',
+        15: 'left_wrist',
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): Keypoint numbers
+        trainsize (list):[w, h] Image target size
+        transform (composed(operators)): A sequence of data transforms.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[]):
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform)
+
+        self.dataset_name = 'mpii'
+
+    def parse_dataset(self):
+        with open(self.get_anno()) as anno_file:
+            anno = json.load(anno_file)
+
+        gt_db = []
+        for a in anno:
+            image_name = a['image']
+            im_id = a['image_id'] if 'image_id' in a else int(
+                os.path.splitext(image_name)[0])
+
+            c = np.array(a['center'], dtype=np.float)
+            s = np.array([a['scale'], a['scale']], dtype=np.float)
+
+            # Adjust center/scale slightly to avoid cropping limbs
+            if c[0] != -1:
+                c[1] = c[1] + 15 * s[1]
+                s = s * 1.25
+            c = c - 1
+
+            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints_vis = np.zeros(
+                (self.ann_info['num_joints'], 3), dtype=np.float)
+            if 'joints' in a:
+                joints_ = np.array(a['joints'])
+                joints_[:, 0:2] = joints_[:, 0:2] - 1
+                joints_vis_ = np.array(a['joints_vis'])
+                assert len(joints_) == self.ann_info[
+                    'num_joints'], 'joint num diff: {} vs {}'.format(
+                        len(joints_), self.ann_info['num_joints'])
+
+                joints[:, 0:2] = joints_[:, 0:2]
+                joints_vis[:, 0] = joints_vis_[:]
+                joints_vis[:, 1] = joints_vis_[:]
+
+            gt_db.append({
+                'image_file': os.path.join(self.img_prefix, image_name),
+                'im_id': im_id,
+                'center': c,
+                'scale': s,
+                'joints': joints,
+                'joints_vis': joints_vis
+            })
+        self.db = gt_db
--- a/ppdet/data/transform/keypoint_operators.py
+++ b/ppdet/data/transform/keypoint_operators.py
@@ -29,7 +29,7 @@ import math
 import copy
 import os

-from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints
+from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform
 from ppdet.core.workspace import serializable
 from ppdet.utils.logger import setup_logger
 logger = setup_logger(__name__)
@@ -38,7 +38,8 @@ registered_ops = []

 __all__ = [
    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
-    'NormalizePermute', 'EvalAffine'
+    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
+    'TopDownAffine', 'ToHeatmapsTopDown'
 ]


@@ -403,3 +404,229 @@ class ToHeatmaps(object):
            records['mask_{}x'.format(idx + 1)] = mask
        del records['mask']
        return records
+
+
+@register_keypointop
+class RandomFlipHalfBodyTransform(object):
+    """apply data augment to image and coords
+    to achieve the flip, scale, rotate and half body transform effect for training image
+
+    Args:
+        trainsize (list):[w, h], Image target size
+        upper_body_ids (list): The upper body joint ids
+        flip_pairs (list): The left-right joints exchange order list
+        pixel_std (int): The pixel std of the scale
+        scale (float): The scale factor to transform the image
+        rot (int): The rotate factor to transform the image
+        num_joints_half_body (int): The joints threshold of the half body transform
+        prob_half_body (float): The threshold of the half body transform
+        flip (bool): Whether to flip the image
+
+    Returns:
+        records(dict): contain the image and coords after tranformed
+
+    """
+
+    def __init__(self,
+                 trainsize,
+                 upper_body_ids,
+                 flip_pairs,
+                 pixel_std,
+                 scale=0.35,
+                 rot=40,
+                 num_joints_half_body=8,
+                 prob_half_body=0.3,
+                 flip=True,
+                 rot_prob=0.6):
+        super(RandomFlipHalfBodyTransform, self).__init__()
+        self.trainsize = trainsize
+        self.upper_body_ids = upper_body_ids
+        self.flip_pairs = flip_pairs
+        self.pixel_std = pixel_std
+        self.scale = scale
+        self.rot = rot
+        self.num_joints_half_body = num_joints_half_body
+        self.prob_half_body = prob_half_body
+        self.flip = flip
+        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
+        self.rot_prob = rot_prob
+
+    def halfbody_transform(self, joints, joints_vis):
+        upper_joints = []
+        lower_joints = []
+        for joint_id in range(joints.shape[0]):
+            if joints_vis[joint_id][0] > 0:
+                if joint_id in self.upper_body_ids:
+                    upper_joints.append(joints[joint_id])
+                else:
+                    lower_joints.append(joints[joint_id])
+        if np.random.randn() < 0.5 and len(upper_joints) > 2:
+            selected_joints = upper_joints
+        else:
+            selected_joints = lower_joints if len(
+                lower_joints) > 2 else upper_joints
+        if len(selected_joints) < 2:
+            return None, None
+        selected_joints = np.array(selected_joints, dtype=np.float32)
+        center = selected_joints.mean(axis=0)[:2]
+        left_top = np.amin(selected_joints, axis=0)
+        right_bottom = np.amax(selected_joints, axis=0)
+        w = right_bottom[0] - left_top[0]
+        h = right_bottom[1] - left_top[1]
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        scale = scale * 1.5
+
+        return center, scale
+
+    def flip_joints(self, joints, joints_vis, width, matched_parts):
+        joints[:, 0] = width - joints[:, 0] - 1
+        for pair in matched_parts:
+            joints[pair[0], :], joints[pair[1], :] = \
+                joints[pair[1], :], joints[pair[0], :].copy()
+            joints_vis[pair[0], :], joints_vis[pair[1], :] = \
+                joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
+
+        return joints * joints_vis, joints_vis
+
+    def __call__(self, records):
+        image = records['image']
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        c = records['center']
+        s = records['scale']
+        r = 0
+        if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and
+                np.random.rand() < self.prob_half_body):
+            c_half_body, s_half_body = self.halfbody_transform(joints,
+                                                               joints_vis)
+            if c_half_body is not None and s_half_body is not None:
+                c, s = c_half_body, s_half_body
+        sf = self.scale
+        rf = self.rot
+        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+        r = np.clip(np.random.randn() * rf, -rf * 2,
+                    rf * 2) if np.random.random() <= self.rot_prob else 0
+
+        if self.flip and np.random.random() <= 0.5:
+            image = image[:, ::-1, :]
+            joints, joints_vis = self.flip_joints(
+                joints, joints_vis, image.shape[1], self.flip_pairs)
+            c[0] = image.shape[1] - c[0] - 1
+        records['image'] = image
+        records['joints'] = joints
+        records['joints_vis'] = joints_vis
+        records['center'] = c
+        records['scale'] = s
+        records['rotate'] = r
+
+        return records
+
+
+@register_keypointop
+class TopDownAffine(object):
+    """apply affine transform to image and coords
+
+    Args:
+        trainsize (list): [w, h], the standard size used to train
+        records(dict): the dict contained the image and coords
+
+    Returns:
+        records (dict): contain the image and coords after tranformed
+
+    """
+
+    def __init__(self, trainsize):
+        self.trainsize = trainsize
+
+    def __call__(self, records):
+        image = records['image']
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        rot = records['rotate'] if "rotate" in records else 0
+        trans = get_affine_transform(records['center'], records['scale'] * 200,
+                                     rot, self.trainsize)
+        image = cv2.warpAffine(
+            image,
+            trans, (int(self.trainsize[0]), int(self.trainsize[1])),
+            flags=cv2.INTER_LINEAR)
+        for i in range(joints.shape[0]):
+            if joints_vis[i, 0] > 0.0:
+                joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
+        records['image'] = image
+        records['joints'] = joints
+
+        return records
+
+
+@register_keypointop
+class ToHeatmapsTopDown(object):
+    """to generate the gaussin heatmaps of keypoint for heatmap loss
+
+    Args:
+        hmsize (list): [w, h] output heatmap's size
+        sigma (float): the std of gaussin kernel genereted
+        records(dict): the dict contained the image and coords
+
+    Returns:
+        records (dict): contain the heatmaps used to heatmaploss
+
+    """
+
+    def __init__(self, hmsize, sigma):
+        super(ToHeatmapsTopDown, self).__init__()
+        self.hmsize = np.array(hmsize)
+        self.sigma = sigma
+
+    def __call__(self, records):
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        num_joints = joints.shape[0]
+        image_size = np.array(
+            [records['image'].shape[1], records['image'].shape[0]])
+        target_weight = np.ones((num_joints, 1), dtype=np.float32)
+        target_weight[:, 0] = joints_vis[:, 0]
+        target = np.zeros(
+            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
+        tmp_size = self.sigma * 3
+        for joint_id in range(num_joints):
+            feat_stride = image_size / self.hmsize
+            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
+            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
+            # Check that any part of the gaussian is in-bounds
+            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
+            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
+            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
+                    0] < 0 or br[1] < 0:
+                # If not, just return the image as is
+                target_weight[joint_id] = 0
+                continue
+            # # Generate gaussian
+            size = 2 * tmp_size + 1
+            x = np.arange(0, size, 1, np.float32)
+            y = x[:, np.newaxis]
+            x0 = y0 = size // 2
+            # The gaussian is not normalized, we want the center value to equal 1
+            g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))
+
+            # Usable gaussian range
+            g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]
+            g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]
+            # Image range
+            img_x = max(0, ul[0]), min(br[0], self.hmsize[0])
+            img_y = max(0, ul[1]), min(br[1], self.hmsize[1])
+
+            v = target_weight[joint_id]
+            if v > 0.5:
+                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[
+                    0]:g_y[1], g_x[0]:g_x[1]]
+        records['target'] = target
+        records['target_weight'] = target_weight
+        del records['joints'], records['joints_vis']
+
+        return records
--- a/ppdet/engine/callbacks.py
+++ b/ppdet/engine/callbacks.py
@@ -166,7 +166,12 @@ class Checkpointer(Callback):
                if 'save_best_model' in status and status['save_best_model']:
                    for metric in self.model._metrics:
                        map_res = metric.get_results()
-                        key = 'bbox' if 'bbox' in map_res else 'mask'
+                        if 'bbox' in map_res:
+                            key = 'bbox'
+                        elif 'keypoint' in map_res:
+                            key = 'keypoint'
+                        else:
+                            key = 'mask'
                        if key not in map_res:
                            logger.warn("Evaluation results empty, this may be due to " \
                                        "training iterations being too few or not " \

--- a/ppdet/engine/export_utils.py
+++ b/ppdet/engine/export_utils.py
@@ -37,10 +37,11 @@ TRT_MIN_SUBGRAPH = {
    'TTFNet': 3,
    'FCOS': 16,
    'SOLOv2': 60,
-    'HigherHrnet': 40,
+    'HigherHRNet': 3,
+    'HRNet': 3,
 }

-KEYPOINT_ARCH = ['HigherHrnet', 'Hrnet']
+KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']


 def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):

--- a/ppdet/engine/trainer.py
+++ b/ppdet/engine/trainer.py
@@ -33,7 +33,7 @@ from ppdet.optimizer import ModelEMA
 from ppdet.core.workspace import create
 from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from ppdet.utils.visualizer import visualize_results, save_result
-from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results
+from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval
 from ppdet.data.source.category import get_categories
 import ppdet.utils.stats as stats

@@ -173,6 +173,15 @@ class Trainer(object):
                    anno_file=self.dataset.get_anno(),
                    multi_scale=multi_scale)
            ]
+        elif self.cfg.metric == 'KeyPointTopDownCOCOEval':
+            eval_dataset = self.cfg['EvalDataset']
+            eval_dataset.check_or_download_dataset()
+            anno_file = eval_dataset.get_anno()
+            self._metrics = [
+                KeyPointTopDownCOCOEval(anno_file,
+                                        len(eval_dataset), self.cfg.num_joints,
+                                        self.cfg.save_dir)
+            ]
        else:
            logger.warn("Metric not support for metric type {}".format(
                self.cfg.metric))
@@ -374,6 +383,7 @@ class Trainer(object):
            self.status['step_id'] = step_id
            # forward
            outs = self.model(data)
+
            for key in ['im_shape', 'scale_factor', 'im_id']:
                outs[key] = data[key]
            for key, value in outs.items():

--- a/ppdet/metrics/__init__.py
+++ b/ppdet/metrics/__init__.py
@@ -13,7 +13,8 @@
 # limitations under the License.

 from . import metrics
-
+from . import keypoint_metrics
 from .metrics import *
+from .keypoint_metrics import *

-__all__ = metrics.__all__
+__all__ = metrics.__all__ + keypoint_metrics.__all__
--- a/ppdet/metrics/keypoint_metrics.py
+++ b/ppdet/metrics/keypoint_metrics.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import copy
+import os
+import json
+from collections import OrderedDict
+from collections import defaultdict
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from ..modeling.keypoint_utils import oks_nms
+
+__all__ = ['KeyPointTopDownCOCOEval']
+
+
+class KeyPointTopDownCOCOEval(object):
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 iou_type='keypoints',
+                 in_vis_thre=0.2,
+                 oks_thre=0.9):
+        super(KeyPointTopDownCOCOEval, self).__init__()
+        self.coco = COCO(anno_file)
+        self.num_samples = num_samples
+        self.num_joints = num_joints
+        self.iou_type = iou_type
+        self.in_vis_thre = in_vis_thre
+        self.oks_thre = oks_thre
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.reset()
+
+    def reset(self):
+        self.results = {
+            'all_preds': np.zeros(
+                (self.num_samples, self.num_joints, 3), dtype=np.float32),
+            'all_boxes': np.zeros((self.num_samples, 6)),
+            'image_path': []
+        }
+        self.eval_results = {}
+        self.idx = 0
+
+    def update(self, inputs, outputs):
+        kpt_coord = outputs['kpt_coord']
+        kpt_score = outputs['kpt_score']
+        num_images = inputs['image'].shape[0]
+        self.results['all_preds'][self.idx:self.idx + num_images, :, 0:
+                                  2] = kpt_coord[:, :, 0:2]
+        self.results['all_preds'][self.idx:self.idx + num_images, :, 2:
+                                  3] = kpt_score
+        self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[
+            'center'].numpy()[:, 0:2]
+        self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[
+            'scale'].numpy()[:, 0:2]
+        self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod(
+            inputs['scale'].numpy() * 200, 1)
+        self.results['all_boxes'][self.idx:self.idx + num_images,
+                                  5] = np.squeeze(inputs['score'].numpy())
+        self.results['image_path'].extend(inputs['im_id'].numpy())
+
+        self.idx += num_images
+
+    def _write_coco_keypoint_results(self, keypoints):
+        data_pack = [{
+            'cat_id': 1,
+            'cls': 'person',
+            'ann_type': 'keypoints',
+            'keypoints': keypoints
+        }]
+        results = self._coco_keypoint_results_one_category_kernel(data_pack[0])
+        if not os.path.exists(self.output_eval):
+            os.makedirs(self.output_eval)
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+        try:
+            json.load(open(self.res_file))
+        except Exception:
+            content = []
+            with open(self.res_file, 'r') as f:
+                for line in f:
+                    content.append(line)
+            content[-1] = ']'
+            with open(self.res_file, 'w') as f:
+                for c in content:
+                    f.write(c)
+
+    def _coco_keypoint_results_one_category_kernel(self, data_pack):
+        cat_id = data_pack['cat_id']
+        keypoints = data_pack['keypoints']
+        cat_results = []
+
+        for img_kpts in keypoints:
+            if len(img_kpts) == 0:
+                continue
+
+            _key_points = np.array(
+                [img_kpts[k]['keypoints'] for k in range(len(img_kpts))])
+            _key_points = _key_points.reshape(_key_points.shape[0], -1)
+
+            result = [{
+                'image_id': img_kpts[k]['image'],
+                'category_id': cat_id,
+                'keypoints': list(_key_points[k]),
+                'score': img_kpts[k]['score'],
+                'center': list(img_kpts[k]['center']),
+                'scale': list(img_kpts[k]['scale'])
+            } for k in range(len(img_kpts))]
+            cat_results.extend(result)
+
+        return cat_results
+
+    def get_final_results(self, preds, all_boxes, img_path):
+        _kpts = []
+        for idx, kpt in enumerate(preds):
+            _kpts.append({
+                'keypoints': kpt,
+                'center': all_boxes[idx][0:2],
+                'scale': all_boxes[idx][2:4],
+                'area': all_boxes[idx][4],
+                'score': all_boxes[idx][5],
+                'image': int(img_path[idx])
+            })
+        # image x person x (keypoints)
+        kpts = defaultdict(list)
+        for kpt in _kpts:
+            kpts[kpt['image']].append(kpt)
+
+        # rescoring and oks nms
+        num_joints = preds.shape[1]
+        in_vis_thre = self.in_vis_thre
+        oks_thre = self.oks_thre
+        oks_nmsed_kpts = []
+        for img in kpts.keys():
+            img_kpts = kpts[img]
+            for n_p in img_kpts:
+                box_score = n_p['score']
+                kpt_score = 0
+                valid_num = 0
+                for n_jt in range(0, num_joints):
+                    t_s = n_p['keypoints'][n_jt][2]
+                    if t_s > in_vis_thre:
+                        kpt_score = kpt_score + t_s
+                        valid_num = valid_num + 1
+                if valid_num != 0:
+                    kpt_score = kpt_score / valid_num
+                # rescoring
+                n_p['score'] = kpt_score * box_score
+
+            keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))],
+                           oks_thre)
+
+            if len(keep) == 0:
+                oks_nmsed_kpts.append(img_kpts)
+            else:
+                oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])
+
+        self._write_coco_keypoint_results(oks_nmsed_kpts)
+
+    def accumulate(self):
+        self.get_final_results(self.results['all_preds'],
+                               self.results['all_boxes'],
+                               self.results['image_path'])
+        coco_dt = self.coco.loadRes(self.res_file)
+        coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')
+        coco_eval.params.useSegm = None
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+
+        keypoint_stats = []
+        for ind in range(len(coco_eval.stats)):
+            keypoint_stats.append((coco_eval.stats[ind]))
+        self.eval_results['keypoint'] = keypoint_stats
+
+    def log(self):
+        stats_names = [
+            'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
+            'AR .75', 'AR (M)', 'AR (L)'
+        ]
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(value) for value in self.eval_results['keypoint']
+        ]) + ' |')
+
+    def get_results(self):
+        return self.eval_results
--- a/ppdet/optimizer.py
+++ b/ppdet/optimizer.py
@@ -139,6 +139,7 @@ class LinearWarmup(object):
        boundary = []
        value = []
        for i in range(self.steps + 1):
+            if self.steps > 0:
                alpha = i / self.steps
                factor = self.start_factor * (1 - alpha) + alpha
                lr = base_lr * factor