Unverified commit e0a525a0, authored by Francisco Massa, committed by GitHub

[WIP] Add Keypoint R-CNN (#69)

* [WIP] Keypoints inference on C2 models works

* Training seems to work

Still gives slightly worse results

* e2e training works, but gives 3 and 5 points lower mAP

* Add modification proposed by @ChangErgou

Improves mAP by 1.5 points, to 0.514 and 0.609

* Keypoints reproduce expected results

* Clean coco.py

* Linter + remove unnecessary code

* Merge criteria for empty bboxes in has_valid_annotation

* Remove trailing print

* Add demo support for keypoints

Still needs further cleanups and improvements, such as adding field support for the other ops in Keypoints

* More cleanups and misc improvements

* Fixes after rebase

* Add information to the readme

* Fix md formatting
Parent 1589ce09
......@@ -27,6 +27,13 @@ R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 |
R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth)
For person keypoint detection:

backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | keypoint AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
R-50-FPN | Keypoint | 1x | 2 | 5.7 | 0.3771 | 9.4 | 0.10941 | 53.7 | 64.3 | 9981060
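
The keypoint model can be trained and evaluated through the usual entry points; a sketch, assuming the standard `tools/train_net.py` and `tools/test_net.py` scripts and the config names added in this PR:

```bash
# train the end-to-end keypoint model (adjust SOLVER.IMS_PER_BATCH / BASE_LR for your GPU count)
python tools/train_net.py --config-file configs/e2e_keypoint_rcnn_R_50_FPN_1x.yaml

# evaluate the converted Detectron weights
python tools/test_net.py --config-file configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml
```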
## Comparison with Detectron and mmdetection
......
......@@ -28,6 +28,8 @@ python webcam.py --min-image-size 300 MODEL.DEVICE cpu
python webcam.py --config-file ../configs/caffe2/e2e_mask_rcnn_R_101_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu
# in order to see the probability heatmaps, pass --show-mask-heatmaps
python webcam.py --min-image-size 300 --show-mask-heatmaps MODEL.DEVICE cpu
# for the keypoint demo
python webcam.py --config-file ../configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml --min-image-size 300 MODEL.DEVICE cpu
```
A notebook with the demo can be found in [demo/Mask_R-CNN_demo.ipynb](demo/Mask_R-CNN_demo.ipynb).
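
The keypoint demo can also be driven from Python; a minimal sketch, assuming it is run from the `demo/` folder (the image filenames are placeholders):

```python
import cv2

from maskrcnn_benchmark.config import cfg
from predictor import COCODemo

# the keypoint config added in this PR, running on CPU
cfg.merge_from_file("../configs/caffe2/e2e_keypoint_rcnn_R_50_FPN_1x_caffe2.yaml")
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])

coco_demo = COCODemo(cfg, min_image_size=300, confidence_threshold=0.7)
image = cv2.imread("example.jpg")                 # placeholder input
composite = coco_demo.run_on_opencv_image(image)  # boxes + keypoint skeletons
cv2.imwrite("result.jpg", composite)
```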
......
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://Caffe2Detectron/COCO/37697547/e2e_keypoint_rcnn_R-50-FPN_1x"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
  ROI_KEYPOINT_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor"
    PREDICTOR: "KeypointRCNNPredictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 56
    SHARE_BOX_FEATURE_EXTRACTOR: False
  KEYPOINT_ON: True
DATASETS:
  TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",)
  TEST: ("keypoints_coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
  ROI_KEYPOINT_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor"
    PREDICTOR: "KeypointRCNNPredictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 56
    SHARE_BOX_FEATURE_EXTRACTOR: False
  KEYPOINT_ON: True
DATASETS:
  TRAIN: ("keypoints_coco_2014_train", "keypoints_coco_2014_valminusminival",)
  TEST: ("keypoints_coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.02
  WEIGHT_DECAY: 0.0001
  STEPS: (60000, 80000)
  MAX_ITER: 90000
......@@ -10,7 +10,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -24,7 +24,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -28,7 +28,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
    OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    PRE_NMS_TOP_N_TRAIN: 2000
    PRE_NMS_TOP_N_TEST: 1000
    POST_NMS_TOP_N_TEST: 1000
    FPN_POST_NMS_TOP_N_TEST: 1000
  ROI_HEADS:
    USE_FPN: True
    BATCH_SIZE_PER_IMAGE: 256
  ROI_BOX_HEAD:
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
  ROI_KEYPOINT_HEAD:
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    FEATURE_EXTRACTOR: "KeypointRCNNFeatureExtractor"
    PREDICTOR: "KeypointRCNNPredictor"
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    RESOLUTION: 56
    SHARE_BOX_FEATURE_EXTRACTOR: False
  KEYPOINT_ON: True
DATASETS:
  TRAIN: ("keypoints_coco_2014_minival",)
  TEST: ("keypoints_coco_2014_minival",)
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1000
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1000
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.005
  WEIGHT_DECAY: 0.0001
  STEPS: (1500,)
  MAX_ITER: 2000
  IMS_PER_BATCH: 4
TEST:
  IMS_PER_BATCH: 2
......@@ -14,7 +14,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -33,7 +33,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -37,7 +37,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -9,7 +9,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -15,7 +15,7 @@ DATASETS:
TRAIN: ("coco_2014_minival",)
TEST: ("coco_2014_minival",)
INPUT:
MIN_SIZE_TRAIN: 600
MIN_SIZE_TRAIN: (600,)
MAX_SIZE_TRAIN: 1000
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1000
......
......@@ -178,6 +178,8 @@ class COCODemo(object):
        result = self.overlay_boxes(result, top_predictions)
        if self.cfg.MODEL.MASK_ON:
            result = self.overlay_mask(result, top_predictions)
        if self.cfg.MODEL.KEYPOINT_ON:
            result = self.overlay_keypoints(result, top_predictions)
        result = self.overlay_class_names(result, top_predictions)

        return result
......@@ -297,6 +299,15 @@ class COCODemo(object):
        return composite

    def overlay_keypoints(self, image, predictions):
        keypoints = predictions.get_field("keypoints")
        kps = keypoints.keypoints
        scores = keypoints.get_field("logits")
        kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy()
        for region in kps:
            image = vis_keypoints(image, region.transpose((1, 0)))
        return image

    def create_mask_montage(self, image, predictions):
        """
        Create a montage showing the probability heatmaps for each one of the
......@@ -357,3 +368,67 @@ class COCODemo(object):
            )

        return image
import numpy as np
import matplotlib.pyplot as plt

from maskrcnn_benchmark.structures.keypoint import PersonKeypoints


# NOTE: cv2 and torch are already imported at the top of predictor.py.
def vis_keypoints(img, kps, kp_thresh=2, alpha=0.7):
    """Visualizes keypoints (adapted from vis_one_image).
    kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob).
    """
    dataset_keypoints = PersonKeypoints.NAMES
    kp_lines = PersonKeypoints.CONNECTIONS

    # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv.
    cmap = plt.get_cmap('rainbow')
    colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)]
    colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors]

    # Perform the drawing on a copy of the image, to allow for blending.
    kp_mask = np.copy(img)

    # Draw mid shoulder / mid hip first for better visualization.
    mid_shoulder = (
        kps[:2, dataset_keypoints.index('right_shoulder')] +
        kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0
    sc_mid_shoulder = np.minimum(
        kps[2, dataset_keypoints.index('right_shoulder')],
        kps[2, dataset_keypoints.index('left_shoulder')])
    mid_hip = (
        kps[:2, dataset_keypoints.index('right_hip')] +
        kps[:2, dataset_keypoints.index('left_hip')]) / 2.0
    sc_mid_hip = np.minimum(
        kps[2, dataset_keypoints.index('right_hip')],
        kps[2, dataset_keypoints.index('left_hip')])
    nose_idx = dataset_keypoints.index('nose')
    if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh:
        cv2.line(
            kp_mask, tuple(mid_shoulder), tuple(kps[:2, nose_idx]),
            color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA)
    if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh:
        cv2.line(
            kp_mask, tuple(mid_shoulder), tuple(mid_hip),
            color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA)

    # Draw the keypoints.
    for l in range(len(kp_lines)):
        i1 = kp_lines[l][0]
        i2 = kp_lines[l][1]
        p1 = kps[0, i1], kps[1, i1]
        p2 = kps[0, i2], kps[1, i2]
        if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh:
            cv2.line(
                kp_mask, p1, p2,
                color=colors[l], thickness=2, lineType=cv2.LINE_AA)
        if kps[2, i1] > kp_thresh:
            cv2.circle(
                kp_mask, p1,
                radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA)
        if kps[2, i2] > kp_thresh:
            cv2.circle(
                kp_mask, p2,
                radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA)

    # Blend the keypoints.
    return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0)
......@@ -23,6 +23,7 @@ _C = CN()
_C.MODEL = CN()
_C.MODEL.RPN_ONLY = False
_C.MODEL.MASK_ON = False
_C.MODEL.KEYPOINT_ON = False
_C.MODEL.DEVICE = "cuda"
_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
_C.MODEL.CLS_AGNOSTIC_BBOX_REG = False
......@@ -38,7 +39,7 @@ _C.MODEL.WEIGHT = ""
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the smallest side of the image during training
-_C.INPUT.MIN_SIZE_TRAIN = 800  # (800,)
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
# Maximum size of the side of the image during training
_C.INPUT.MAX_SIZE_TRAIN = 1333
# Size of the smallest side of the image during testing
......@@ -232,6 +233,18 @@ _C.MODEL.ROI_MASK_HEAD.DILATION = 1
# GN
_C.MODEL.ROI_MASK_HEAD.USE_GN = False
_C.MODEL.ROI_KEYPOINT_HEAD = CN()
_C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
_C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
_C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
_C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
_C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
_C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
# ---------------------------------------------------------------------------- #
# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
# Note that parts of a resnet may be used for both the backbone and the head
......
......@@ -31,6 +31,22 @@ class DatasetCatalog(object):
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_valminusminival2014.json"
},
"keypoints_coco_2014_train": {
"img_dir": "coco/train2014",
"ann_file": "annotations/person_keypoints_train2017_train_mod2.json",
},
"keypoints_coco_2014_val": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_val2014.json"
},
"keypoints_coco_2014_minival": {
"img_dir": "coco/val2014",
"ann_file": "annotations/person_keypoints_val2017_mod.json",
},
"keypoints_coco_2014_valminusminival": {
"img_dir": "coco/val2014",
"ann_file": "annotations/person_keypoints_train2017_valminusminival_mod2.json",
},
"voc_2007_train": {
"data_dir": "voc/VOC2007",
"split": "train"
......@@ -92,6 +108,18 @@ class DatasetCatalog(object):
    @staticmethod
    def get(name):
        if "keypoints" in name:
            data_dir = DatasetCatalog.DATA_DIR
            anno_dir = "/private/home/fmassa/coco_trainval2017"
            attrs = DatasetCatalog.DATASETS[name]
            args = dict(
                root=os.path.join(data_dir, attrs["img_dir"]),
                ann_file=os.path.join(anno_dir, attrs["ann_file"]),
            )
            return dict(
                factory="COCODataset",
                args=args,
            )
        if "coco" in name:
            data_dir = DatasetCatalog.DATA_DIR
            attrs = DatasetCatalog.DATASETS[name]
......@@ -127,7 +155,7 @@ class ModelCatalog(object):
"FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
}
C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl"
C2_DETECTRON_MODELS = {
"35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
"35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
......@@ -138,6 +166,8 @@ class ModelCatalog(object):
"35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
"36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
"37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK",
# keypoints
"37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao"
}
@staticmethod
......@@ -162,7 +192,8 @@ class ModelCatalog(object):
        # prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
        # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
        prefix = ModelCatalog.S3_C2_DETECTRON_URL
-        suffix = ModelCatalog.C2_DETECTRON_SUFFIX
+        dataset_tag = "keypoints_" if "keypoint" in name else ""
+        suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag)
        # remove identification prefix
        name = name[len("Caffe2Detectron/COCO/"):]
        # split in <model_id> and <model_name>
......
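
For reference, the formatted suffix differs from the detection models only by the dataset tag; a small sketch of the expansion for the keypoint model:

```python
# Sketch: how C2_DETECTRON_SUFFIX expands for a keypoint catalog entry.
suffix_tpl = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl"
tag = "keypoints_"  # chosen because "keypoint" appears in the catalog name
print(suffix_tpl.format(tag, tag))
# -> output/train/keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival/generalized_rcnn/model_final.pkl
```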
......@@ -4,6 +4,36 @@ import torchvision
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask
from maskrcnn_benchmark.structures.keypoint import PersonKeypoints
min_keypoints_per_image = 10


def _count_visible_keypoints(anno):
    return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)


def _has_only_empty_bbox(anno):
    return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)


def has_valid_annotation(anno):
    # if it's empty, there is no annotation
    if len(anno) == 0:
        return False
    # if all boxes have close to zero area, there is no annotation
    if _has_only_empty_bbox(anno):
        return False
    # the keypoints task has a slightly different criterion for considering
    # whether an annotation is valid
    if "keypoints" not in anno[0]:
        return True
    # for keypoint detection tasks, only consider as valid the images that
    # contain at least min_keypoints_per_image visible keypoints
    if _count_visible_keypoints(anno) >= min_keypoints_per_image:
        return True
    return False
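
A toy annotation makes the visibility criterion concrete; a sketch using the helpers above (the dict mimics the COCO annotation format):

```python
# COCO stores keypoints flat as [x1, y1, v1, x2, y2, v2, ...], so the [2::3]
# slice in _count_visible_keypoints picks out the visibility flags.
anno = [{
    "bbox": [10.0, 20.0, 50.0, 80.0],  # xywh; width/height clearly > 1
    "keypoints": [100, 150, 2] * 17,   # all 17 COCO keypoints marked visible
}]
assert _count_visible_keypoints(anno) == 17
assert not _has_only_empty_bbox(anno)
assert has_valid_annotation(anno)      # 17 >= min_keypoints_per_image
```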
class COCODataset(torchvision.datasets.coco.CocoDetection):
......@@ -16,26 +46,13 @@ class COCODataset(torchvision.datasets.coco.CocoDetection):
        # filter images without detection annotations
        if remove_images_without_annotations:
-            self.ids = [
-                img_id
-                for img_id in self.ids
-                if len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0
-            ]
-            ids_to_remove = []
+            ids = []
            for img_id in self.ids:
-                ann_ids = self.coco.getAnnIds(imgIds=img_id)
+                ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
                anno = self.coco.loadAnns(ann_ids)
-                if all(
-                    any(o <= 1 for o in obj["bbox"][2:])
-                    for obj in anno
-                    if obj["iscrowd"] == 0
-                ):
-                    ids_to_remove.append(img_id)
-            self.ids = [
-                img_id for img_id in self.ids if img_id not in ids_to_remove
-            ]
+                if has_valid_annotation(anno):
+                    ids.append(img_id)
+            self.ids = ids

        self.json_category_id_to_contiguous_id = {
            v: i + 1 for i, v in enumerate(self.coco.getCatIds())
......@@ -66,6 +83,11 @@ class COCODataset(torchvision.datasets.coco.CocoDetection):
        masks = SegmentationMask(masks, img.size)
        target.add_field("masks", masks)

        if anno and "keypoints" in anno[0]:
            keypoints = [obj["keypoints"] for obj in anno]
            keypoints = PersonKeypoints(keypoints, img.size)
            target.add_field("keypoints", keypoints)

        target = target.clip_to_image(remove_empty=True)

        if self.transforms is not None:
......
......@@ -45,6 +45,9 @@ def do_coco_evaluation(
if "segm" in iou_types:
logger.info("Preparing segm results")
coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset)
if 'keypoints' in iou_types:
logger.info('Preparing keypoints results')
coco_results['keypoints'] = prepare_for_coco_keypoint(predictions, dataset)
results = COCOResults(*iou_types)
logger.info("Evaluating predictions")
......@@ -152,6 +155,36 @@ def prepare_for_coco_segmentation(predictions, dataset):
    return coco_results


def prepare_for_coco_keypoint(predictions, dataset):
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction.bbox) == 0:
            continue

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]['width']
        image_height = dataset.coco.imgs[original_id]['height']
        prediction = prediction.resize((image_width, image_height))
        prediction = prediction.convert('xywh')

        boxes = prediction.bbox.tolist()
        scores = prediction.get_field('scores').tolist()
        labels = prediction.get_field('labels').tolist()
        keypoints = prediction.get_field('keypoints')
        keypoints = keypoints.resize((image_width, image_height))
        keypoints = keypoints.keypoints.view(keypoints.keypoints.shape[0], -1).tolist()

        mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels]

        coco_results.extend(
            [
                {
                    'image_id': original_id,
                    'category_id': mapped_labels[k],
                    'keypoints': keypoint,
                    'score': scores[k],
                }
                for k, keypoint in enumerate(keypoints)
            ]
        )
    return coco_results
# inspired from Detectron
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
......@@ -304,11 +337,11 @@ class COCOResults(object):
"ARm@1000",
"ARl@1000",
],
"keypoint": ["AP", "AP50", "AP75", "APm", "APl"],
"keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
}
    def __init__(self, *iou_types):
-        allowed_types = ("box_proposal", "bbox", "segm")
+        allowed_types = ("box_proposal", "bbox", "segm", "keypoints")
        assert all(iou_type in allowed_types for iou_type in iou_types)
        results = OrderedDict()
        for iou_type in iou_types:
......
......@@ -26,13 +26,15 @@ class Compose(object):
class Resize(object):
    def __init__(self, min_size, max_size):
        if not isinstance(min_size, (list, tuple)):
            min_size = (min_size,)
        self.min_size = min_size
        self.max_size = max_size

    # modified from torchvision to add support for max size
    def get_size(self, image_size):
        w, h = image_size
-        size = self.min_size
+        size = random.choice(self.min_size)
        max_size = self.max_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
......
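
The effect of the change is that `MIN_SIZE_TRAIN` now expresses multi-scale training; a sketch of the sampling behavior:

```python
import random

# One scale is sampled per image from the tuple (values from the keypoint configs);
# a scalar, as in the old configs, is wrapped into a 1-tuple by Resize.__init__,
# so random.choice then degenerates to a fixed size.
min_size_train = (640, 672, 704, 736, 768, 800)
size = random.choice(min_size_train)
assert size in min_size_train
assert random.choice((600,)) == 600
```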
import torch
from torch import nn


class KeypointPostProcessor(nn.Module):
    def __init__(self, keypointer=None):
        super(KeypointPostProcessor, self).__init__()
        self.keypointer = keypointer

    def forward(self, x, boxes):
        mask_prob = x

        scores = None
        if self.keypointer:
            mask_prob, scores = self.keypointer(x, boxes)

        assert len(boxes) == 1, "Only non-batched inference supported for now"
        boxes_per_image = [box.bbox.size(0) for box in boxes]
        mask_prob = mask_prob.split(boxes_per_image, dim=0)
        scores = scores.split(boxes_per_image, dim=0)

        results = []
        for prob, box, score in zip(mask_prob, boxes, scores):
            bbox = BoxList(box.bbox, box.size, mode="xyxy")
            for field in box.fields():
                bbox.add_field(field, box.get_field(field))
            prob = PersonKeypoints(prob, box.size)
            prob.add_field("logits", score)
            bbox.add_field("keypoints", prob)
            results.append(bbox)

        return results


# TODO remove and use only the Keypointer
import numpy as np
import cv2


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps. Returns an array of
    shape (#rois, #keypoints, 3) holding (x, y, 1) per keypoint, plus the
    heatmap score at each predicted location.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = np.maximum(widths, 1)
    heights = np.maximum(heights, 1)
    widths_ceil = np.ceil(widths)
    heights_ceil = np.ceil(heights)

    # NCHW to NHWC for use with OpenCV
    maps = np.transpose(maps, [0, 2, 3, 1])
    min_size = 0  # cfg.KRCNN.INFERENCE_MIN_SIZE
    num_keypoints = maps.shape[3]
    xy_preds = np.zeros((len(rois), 3, num_keypoints), dtype=np.float32)
    end_scores = np.zeros((len(rois), num_keypoints), dtype=np.float32)
    for i in range(len(rois)):
        if min_size > 0:
            roi_map_width = int(np.maximum(widths_ceil[i], min_size))
            roi_map_height = int(np.maximum(heights_ceil[i], min_size))
        else:
            roi_map_width = widths_ceil[i]
            roi_map_height = heights_ceil[i]
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = cv2.resize(
            maps[i], (roi_map_width, roi_map_height), interpolation=cv2.INTER_CUBIC
        )
        # Bring back to CHW
        roi_map = np.transpose(roi_map, [2, 0, 1])
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(axis=1)
        x_int = pos % w
        y_int = (pos - x_int) // w
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int + 0.5) * width_correction
        y = (y_int + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[np.arange(num_keypoints), y_int, x_int]

    return np.transpose(xy_preds, [0, 2, 1]), end_scores


from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.keypoint import PersonKeypoints


class Keypointer(object):
    """
    Projects a set of masks in an image on the locations
    specified by the bounding boxes
    """

    def __init__(self, padding=0):
        self.padding = padding

    def __call__(self, masks, boxes):
        # TODO do this properly
        if isinstance(boxes, BoxList):
            boxes = [boxes]
        assert len(boxes) == 1

        result, scores = heatmaps_to_keypoints(
            masks.detach().cpu().numpy(), boxes[0].bbox.cpu().numpy()
        )
        return torch.from_numpy(result).to(masks.device), torch.as_tensor(scores, device=masks.device)


def make_roi_keypoint_post_processor(cfg):
    keypointer = Keypointer()
    keypoint_post_processor = KeypointPostProcessor(keypointer)
    return keypoint_post_processor
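
The cv2 upsampling step aside, the decoding math in `heatmaps_to_keypoints` reduces to scaling the argmax bin back into the ROI; a standalone sketch with made-up numbers:

```python
import numpy as np

# Decode a 4x4 heatmap peak into image coordinates using the Heckbert 1990
# convention (continuous = discrete + 0.5), as in heatmaps_to_keypoints above.
heatmap = np.zeros((4, 4), dtype=np.float32)
heatmap[2, 3] = 10.0                      # peak at discrete (x=3, y=2)
roi = np.array([8.0, 6.0, 16.0, 14.0])    # (x0, y0, x1, y1): an 8x8 box

pos = heatmap.reshape(-1).argmax()
x_int, y_int = pos % 4, pos // 4
width_correction = (roi[2] - roi[0]) / 4  # box width / heatmap width = 2.0
height_correction = (roi[3] - roi[1]) / 4
x = (x_int + 0.5) * width_correction + roi[0]   # (3 + 0.5) * 2 + 8 = 15.0
y = (y_int + 0.5) * height_correction + roi[1]  # (2 + 0.5) * 2 + 6 = 11.0
print(x, y, heatmap[y_int, x_int])        # 15.0 11.0 10.0
```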
import torch

from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor
from .roi_keypoint_predictors import make_roi_keypoint_predictor
from .inference import make_roi_keypoint_post_processor
from .loss import make_roi_keypoint_loss_evaluator


class ROIKeypointHead(torch.nn.Module):
    def __init__(self, cfg):
        super(ROIKeypointHead, self).__init__()
        self.cfg = cfg.clone()
        self.feature_extractor = make_roi_keypoint_feature_extractor(cfg)
        self.predictor = make_roi_keypoint_predictor(cfg)
        self.post_processor = make_roi_keypoint_post_processor(cfg)
        self.loss_evaluator = make_roi_keypoint_loss_evaluator(cfg)

    def forward(self, features, proposals, targets=None):
        """
        Arguments:
            features (list[Tensor]): feature-maps from possibly several levels
            proposals (list[BoxList]): proposal boxes
            targets (list[BoxList], optional): the ground-truth targets.

        Returns:
            x (Tensor): the result of the feature extractor
            proposals (list[BoxList]): during training, the original proposals
                are returned. During testing, the predicted boxlists are returned
                with the `keypoints` field set
            losses (dict[Tensor]): During training, returns the losses for the
                head. During testing, returns an empty dict.
        """
        if self.training:
            with torch.no_grad():
                proposals = self.loss_evaluator.subsample(proposals, targets)

        x = self.feature_extractor(features, proposals)
        kp_logits = self.predictor(x)

        if not self.training:
            result = self.post_processor(kp_logits, proposals)
            return x, result, {}

        loss_kp = self.loss_evaluator(proposals, kp_logits)

        return x, proposals, dict(loss_kp=loss_kp)


def build_roi_keypoint_head(cfg):
    return ROIKeypointHead(cfg)
import torch
from torch.nn import functional as F

from maskrcnn_benchmark.modeling.matcher import Matcher
from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import (
    BalancedPositiveNegativeSampler,
)
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
from maskrcnn_benchmark.modeling.utils import cat
from maskrcnn_benchmark.layers import smooth_l1_loss
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
from maskrcnn_benchmark.structures.keypoint import keypoints_to_heat_map


def project_keypoints_to_heatmap(keypoints, proposals, discretization_size):
    proposals = proposals.convert("xyxy")
    return keypoints_to_heat_map(
        keypoints.keypoints, proposals.bbox, discretization_size
    )


def cat_boxlist_with_keypoints(boxlists):
    assert all(boxlist.has_field("keypoints") for boxlist in boxlists)

    kp = [boxlist.get_field("keypoints").keypoints for boxlist in boxlists]
    kp = cat(kp, 0)

    fields = boxlists[0].get_fields()
    fields = [field for field in fields if field != "keypoints"]
    boxlists = [boxlist.copy_with_fields(fields) for boxlist in boxlists]
    boxlists = cat_boxlist(boxlists)
    boxlists.add_field("keypoints", kp)
    return boxlists


def _within_box(points, boxes):
    """Validate which keypoints are contained inside a given box.
    points: NxKx2
    boxes: Nx4
    output: NxK
    """
    x_within = (points[..., 0] >= boxes[:, 0, None]) & (
        points[..., 0] <= boxes[:, 2, None]
    )
    y_within = (points[..., 1] >= boxes[:, 1, None]) & (
        points[..., 1] <= boxes[:, 3, None]
    )
    return x_within & y_within


class KeypointRCNNLossComputation(object):
    def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size):
        """
        Arguments:
            proposal_matcher (Matcher)
            fg_bg_sampler (BalancedPositiveNegativeSampler)
            discretization_size (int)
        """
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.discretization_size = discretization_size

    def match_targets_to_proposals(self, proposal, target):
        match_quality_matrix = boxlist_iou(target, proposal)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # Keypoint RCNN needs "labels" and "keypoints" fields for creating the targets
        target = target.copy_with_fields(["labels", "keypoints"])
        # get the targets corresponding GT for each proposal
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, proposals, targets):
        labels = []
        keypoints = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )
            matched_idxs = matched_targets.get_field("matched_idxs")

            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # this can probably be removed, but is left here for clarity
            # and completeness
            # TODO check if this is the right one, as BELOW_THRESHOLD
            neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[neg_inds] = 0

            keypoints_per_image = matched_targets.get_field("keypoints")
            within_box = _within_box(
                keypoints_per_image.keypoints, matched_targets.bbox
            )
            vis_kp = keypoints_per_image.keypoints[..., 2] > 0
            is_visible = (within_box & vis_kp).sum(1) > 0

            labels_per_image[~is_visible] = -1

            labels.append(labels_per_image)
            keypoints.append(keypoints_per_image)

        return labels, keypoints

    def subsample(self, proposals, targets):
        """
        This method performs the positive/negative sampling, and returns
        the sampled proposals.
        Note: this function keeps a state.

        Arguments:
            proposals (list[BoxList])
            targets (list[BoxList])
        """
        labels, keypoints = self.prepare_targets(proposals, targets)
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)

        proposals = list(proposals)
        # add corresponding label and regression_targets information to the bounding boxes
        for labels_per_image, keypoints_per_image, proposals_per_image in zip(
            labels, keypoints, proposals
        ):
            proposals_per_image.add_field("labels", labels_per_image)
            proposals_per_image.add_field("keypoints", keypoints_per_image)

        # distribute sampled proposals, that were obtained on all feature maps
        # concatenated via the fg_bg_sampler, into individual feature map levels
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
            zip(sampled_pos_inds, sampled_neg_inds)
        ):
            img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1)
            proposals_per_image = proposals[img_idx][img_sampled_inds]
            proposals[img_idx] = proposals_per_image

        self._proposals = proposals
        return proposals

    def __call__(self, proposals, keypoint_logits):
        heatmaps = []
        valid = []
        for proposals_per_image in proposals:
            kp = proposals_per_image.get_field("keypoints")
            heatmaps_per_image, valid_per_image = project_keypoints_to_heatmap(
                kp, proposals_per_image, self.discretization_size
            )
            heatmaps.append(heatmaps_per_image.view(-1))
            valid.append(valid_per_image.view(-1))

        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid).squeeze(1)

        # torch.mean (in binary_cross_entropy_with_logits) doesn't
        # accept empty tensors, so handle it separately
        if keypoint_targets.numel() == 0 or len(valid) == 0:
            return keypoint_logits.sum() * 0

        N, K, H, W = keypoint_logits.shape
        keypoint_logits = keypoint_logits.view(N * K, H * W)

        keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
        return keypoint_loss


def make_roi_keypoint_loss_evaluator(cfg):
    matcher = Matcher(
        cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD,
        cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD,
        allow_low_quality_matches=False,
    )
    fg_bg_sampler = BalancedPositiveNegativeSampler(
        cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
    )
    resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION
    loss_evaluator = KeypointRCNNLossComputation(matcher, fg_bg_sampler, resolution)
    return loss_evaluator
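
The loss treats each (proposal, keypoint) pair as a classification over heatmap bins; a shape sketch of the `__call__` above with dummy tensors:

```python
import torch
from torch.nn import functional as F

# Each of the N*K keypoints is a softmax over H*W spatial bins, trained against
# the GT bin index produced by keypoints_to_heat_map; invisible keypoints are
# dropped via the `valid` index.
N, K, H, W = 2, 17, 56, 56
keypoint_logits = torch.randn(N, K, H, W)
keypoint_targets = torch.randint(0, H * W, (N * K,))  # linear indices y*W + x
valid = torch.arange(N * K)[: N * K // 2]             # pretend half are visible

logits = keypoint_logits.view(N * K, H * W)
loss = F.cross_entropy(logits[valid], keypoint_targets[valid])
print(loss)  # scalar loss tensor
```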
from torch import nn
from torch.nn import functional as F

from maskrcnn_benchmark.modeling.poolers import Pooler
from maskrcnn_benchmark.layers import Conv2d


class KeypointRCNNFeatureExtractor(nn.Module):
    def __init__(self, cfg):
        super(KeypointRCNNFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        self.pooler = pooler

        input_features = cfg.MODEL.BACKBONE.OUT_CHANNELS
        layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS
        next_feature = input_features
        self.blocks = []
        for layer_idx, layer_features in enumerate(layers, 1):
            layer_name = "conv_fcn{}".format(layer_idx)
            module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1)
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            nn.init.constant_(module.bias, 0)
            self.add_module(layer_name, module)
            next_feature = layer_features
            self.blocks.append(layer_name)

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)
        for layer_name in self.blocks:
            x = F.relu(getattr(self, layer_name)(x))
        return x


_ROI_KEYPOINT_FEATURE_EXTRACTORS = {
    "KeypointRCNNFeatureExtractor": KeypointRCNNFeatureExtractor
}


def make_roi_keypoint_feature_extractor(cfg):
    func = _ROI_KEYPOINT_FEATURE_EXTRACTORS[
        cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR
    ]
    return func(cfg)
from torch import nn
from torch.nn import functional as F

from maskrcnn_benchmark import layers


class KeypointRCNNPredictor(nn.Module):
    def __init__(self, cfg):
        super(KeypointRCNNPredictor, self).__init__()
        input_features = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS[-1]
        num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES
        deconv_kernel = 4
        self.kps_score_lowres = layers.ConvTranspose2d(
            input_features,
            num_keypoints,
            deconv_kernel,
            stride=2,
            padding=deconv_kernel // 2 - 1,
        )
        nn.init.kaiming_normal_(
            self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu"
        )
        nn.init.constant_(self.kps_score_lowres.bias, 0)
        self.up_scale = 2

    def forward(self, x):
        x = self.kps_score_lowres(x)
        x = layers.interpolate(
            x, scale_factor=self.up_scale, mode="bilinear", align_corners=False
        )
        return x


_ROI_KEYPOINT_PREDICTOR = {"KeypointRCNNPredictor": KeypointRCNNPredictor}


def make_roi_keypoint_predictor(cfg):
    func = _ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR]
    return func(cfg)
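
The predictor turns the 14x14 pooled features into 56x56 heatmaps, matching `RESOLUTION: 56` in the configs; a shape sketch with plain PyTorch layers standing in for the wrapped ones:

```python
import torch
from torch import nn
from torch.nn import functional as F

x = torch.randn(1, 512, 14, 14)  # CONV_LAYERS[-1] = 512 channels
# ConvTranspose2d(k=4, s=2, p=1): (14 - 1) * 2 - 2 * 1 + 4 = 28
deconv = nn.ConvTranspose2d(512, 17, 4, stride=2, padding=1)
x = deconv(x)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False)
print(x.shape)  # torch.Size([1, 17, 56, 56]): one heatmap per keypoint
```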
......@@ -3,6 +3,7 @@ import torch
from .box_head.box_head import build_roi_box_head
from .mask_head.mask_head import build_roi_mask_head
from .keypoint_head.keypoint_head import build_roi_keypoint_head
class CombinedROIHeads(torch.nn.ModuleDict):
......@@ -16,6 +17,8 @@ class CombinedROIHeads(torch.nn.ModuleDict):
        self.cfg = cfg.clone()
        if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            self.mask.feature_extractor = self.box.feature_extractor
        if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            self.keypoint.feature_extractor = self.box.feature_extractor

    def forward(self, features, proposals, targets=None):
        losses = {}
......@@ -35,6 +38,20 @@ class CombinedROIHeads(torch.nn.ModuleDict):
            # this makes the API consistent during training and testing
            x, detections, loss_mask = self.mask(mask_features, detections, targets)
            losses.update(loss_mask)

        if self.cfg.MODEL.KEYPOINT_ON:
            keypoint_features = features
            # optimization: during training, if we share the feature extractor between
            # the box and the keypoint heads, then we can reuse the features already computed
            if (
                self.training
                and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
            ):
                keypoint_features = x
            # During training, self.box() will return the unaltered proposals as "detections"
            # this makes the API consistent during training and testing
            x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets)
            losses.update(loss_keypoint)
        return x, detections, losses
......@@ -46,6 +63,8 @@ def build_roi_heads(cfg):
roi_heads.append(("box", build_roi_box_head(cfg)))
if cfg.MODEL.MASK_ON:
roi_heads.append(("mask", build_roi_mask_head(cfg)))
if cfg.MODEL.KEYPOINT_ON:
roi_heads.append(("keypoint", build_roi_keypoint_head(cfg)))
# combine individual heads in a single module
if roi_heads:
......
import torch


# transpose
FLIP_LEFT_RIGHT = 0
FLIP_TOP_BOTTOM = 1


class Keypoints(object):
    def __init__(self, keypoints, size, mode=None):
        # FIXME remove check once we have better integration with device
        # in my version this would consistently return a CPU tensor
        device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu')
        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
        num_keypoints = keypoints.shape[0]
        if num_keypoints:
            keypoints = keypoints.view(num_keypoints, -1, 3)

        # TODO should I split them?
        # self.visibility = keypoints[..., 2]
        self.keypoints = keypoints  # [..., :2]

        self.size = size
        self.mode = mode
        self.extra_fields = {}

    def crop(self, box):
        raise NotImplementedError()

    def resize(self, size, *args, **kwargs):
        ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size))
        ratio_w, ratio_h = ratios
        resized_data = self.keypoints.clone()
        resized_data[..., 0] *= ratio_w
        resized_data[..., 1] *= ratio_h
        keypoints = type(self)(resized_data, size, self.mode)
        for k, v in self.extra_fields.items():
            keypoints.add_field(k, v)
        return keypoints

    def transpose(self, method):
        if method not in (FLIP_LEFT_RIGHT,):
            raise NotImplementedError(
                "Only FLIP_LEFT_RIGHT implemented")

        flip_inds = type(self).FLIP_INDS
        flipped_data = self.keypoints[:, flip_inds]
        width = self.size[0]
        TO_REMOVE = 1
        # Flip x coordinates
        flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE

        # Maintain COCO convention that if visibility == 0, then x, y = 0
        inds = flipped_data[..., 2] == 0
        flipped_data[inds] = 0

        keypoints = type(self)(flipped_data, self.size, self.mode)
        for k, v in self.extra_fields.items():
            keypoints.add_field(k, v)
        return keypoints

    def to(self, *args, **kwargs):
        keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode)
        for k, v in self.extra_fields.items():
            if hasattr(v, "to"):
                v = v.to(*args, **kwargs)
            keypoints.add_field(k, v)
        return keypoints

    def __getitem__(self, item):
        keypoints = type(self)(self.keypoints[item], self.size, self.mode)
        for k, v in self.extra_fields.items():
            keypoints.add_field(k, v[item])
        return keypoints

    def add_field(self, field, field_data):
        self.extra_fields[field] = field_data

    def get_field(self, field):
        return self.extra_fields[field]

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += 'num_instances={}, '.format(len(self.keypoints))
        s += 'image_width={}, '.format(self.size[0])
        s += 'image_height={})'.format(self.size[1])
        return s
def _create_flip_indices(names, flip_map):
    full_flip_map = flip_map.copy()
    full_flip_map.update({v: k for k, v in flip_map.items()})
    flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names]
    flip_indices = [names.index(i) for i in flipped_names]
    return torch.tensor(flip_indices)
class PersonKeypoints(Keypoints):
    NAMES = [
        'nose',
        'left_eye',
        'right_eye',
        'left_ear',
        'right_ear',
        'left_shoulder',
        'right_shoulder',
        'left_elbow',
        'right_elbow',
        'left_wrist',
        'right_wrist',
        'left_hip',
        'right_hip',
        'left_knee',
        'right_knee',
        'left_ankle',
        'right_ankle'
    ]
    FLIP_MAP = {
        'left_eye': 'right_eye',
        'left_ear': 'right_ear',
        'left_shoulder': 'right_shoulder',
        'left_elbow': 'right_elbow',
        'left_wrist': 'right_wrist',
        'left_hip': 'right_hip',
        'left_knee': 'right_knee',
        'left_ankle': 'right_ankle'
    }


# TODO this doesn't look great
PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP)
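
A quick sanity check of the flip indices (a sketch, runnable wherever the package is importable):

```python
from maskrcnn_benchmark.structures.keypoint import PersonKeypoints

# FLIP_INDS permutes each keypoint index to its horizontal mirror, so
# left/right pairs swap while unpaired joints like 'nose' map to themselves.
names = PersonKeypoints.NAMES
flip = PersonKeypoints.FLIP_INDS
assert names[int(flip[names.index('left_eye')])] == 'right_eye'
assert names[int(flip[names.index('nose')])] == 'nose'
```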
def kp_connections(keypoints):
    kp_lines = [
        [keypoints.index('left_eye'), keypoints.index('right_eye')],
        [keypoints.index('left_eye'), keypoints.index('nose')],
        [keypoints.index('right_eye'), keypoints.index('nose')],
        [keypoints.index('right_eye'), keypoints.index('right_ear')],
        [keypoints.index('left_eye'), keypoints.index('left_ear')],
        [keypoints.index('right_shoulder'), keypoints.index('right_elbow')],
        [keypoints.index('right_elbow'), keypoints.index('right_wrist')],
        [keypoints.index('left_shoulder'), keypoints.index('left_elbow')],
        [keypoints.index('left_elbow'), keypoints.index('left_wrist')],
        [keypoints.index('right_hip'), keypoints.index('right_knee')],
        [keypoints.index('right_knee'), keypoints.index('right_ankle')],
        [keypoints.index('left_hip'), keypoints.index('left_knee')],
        [keypoints.index('left_knee'), keypoints.index('left_ankle')],
        [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')],
        [keypoints.index('right_hip'), keypoints.index('left_hip')],
    ]
    return kp_lines


PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES)
# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop)
def keypoints_to_heat_map(keypoints, rois, heatmap_size):
    if rois.numel() == 0:
        return rois.new().long(), rois.new().long()
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()

    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid
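
A toy check of the discretization (a sketch):

```python
import torch

from maskrcnn_benchmark.structures.keypoint import keypoints_to_heat_map

# A visible keypoint at (25, 10) in the box (0, 0, 100, 50) with heatmap_size=56:
# x = floor(25 * 56 / 100) = 14, y = floor(10 * 56 / 50) = 11 -> bin 11*56 + 14.
keypoints = torch.tensor([[[25.0, 10.0, 2.0]]])  # (num_boxes, num_keypoints, 3)
rois = torch.tensor([[0.0, 0.0, 100.0, 50.0]])
heatmaps, valid = keypoints_to_heat_map(keypoints, rois, 56)
print(heatmaps, valid)  # tensor([[630]]) tensor([[1]])
```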
......@@ -68,6 +68,8 @@ def main():
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
......
......@@ -83,6 +83,8 @@ def test(cfg, model, distributed):
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
......