import copy
import glob
import os
import sys

import cv2
import numpy as np
import paddle
from PIL import Image

from ppdet.engine import Trainer
from ppdet.utils.logger import setup_logger

logger = setup_logger('ppdet_cam')


def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))

    return images
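
# Usage sketch for get_test_images (illustrative paths, not shipped with this
# module): pass a single image or a directory; a valid --infer_img wins.
#   >>> get_test_images(None, 'demo/000000014439.jpg')
#   ['demo/000000014439.jpg']
#   >>> get_test_images('demo/', None)  # every jpg/jpeg/png/bmp in the folder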


def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix for two sets of boxes.

    Args:
        boxes1: numpy ndarray with shape (N, 4), boxes in (xmin, ymin, xmax, ymax) format
        boxes2: numpy ndarray with shape (M, 4), boxes in (xmin, ymin, xmax, ymax) format

    Returns:
        pairwise IoU matrix with shape (N, M), where the value at the ith row,
        jth column holds the IoU between the ith box of boxes1 and the jth box
        of boxes2.
    """
    # left-upper corners of all intersections, shape (N, M, 2);
    # boxes1[:, None, :2] has shape (N, 1, 2) and broadcasts against (M, 2)
    lu = np.maximum(boxes1[:, None, :2], boxes2[:, :2])
    # right-lower corners of all intersections, same shape as lu
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])
    intersection_wh = np.maximum(0.0, rd - lu)
    intersection_area = intersection_wh[:, :, 0] * intersection_wh[:, :, 1]  # shape (N, M)
    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])
    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # shape (N,)
    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])
    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # shape (M,)
    union_area = np.maximum(
        boxes1_area[:, None] + boxes2_area - intersection_area,
        1e-8)  # shape (N, M)
    ious = np.clip(intersection_area / union_area, 0.0, 1.0)
    return ious
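
# Worked example (hypothetical boxes): two 2x2 boxes overlapping in a 1x1
# region give IoU = 1 / (4 + 4 - 1) ≈ 0.1429.
#   >>> compute_ious(np.array([[0., 0., 2., 2.]]),
#   ...              np.array([[1., 1., 3., 3.]]))
#   array([[0.14285714]])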


def grad_cam(feat, grad):
    """
    Args:
        feat: featuremap with shape CxHxW
        grad: gradient of the target score w.r.t. feat, with shape CxHxW

    Returns:
        cam: HxW saliency map
    """
    # weight each channel by its global-average-pooled gradient,
    # then average over channels
    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)
    # note: this keeps the positive part of the *negated* weighted sum;
    # the classic Grad-CAM formulation applies ReLU to the sum directly
    exp = np.maximum(-exp, 0)
    return exp
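
# Shape sketch (random tensors for illustration): a CxHxW featuremap and its
# gradient reduce to an HxW map; channels are collapsed by the weighted mean.
#   >>> f = np.random.randn(16, 8, 8)
#   >>> g = np.random.randn(16, 8, 8)
#   >>> grad_cam(f, g).shape
#   (8, 8)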


def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Args:
        explanation: 2D saliency map with shape (height, width)
        resize_shape: target size (width, height), as expected by cv2.resize

    Returns:
        an uint8 RGB heatmap with shape (height, width, 3)
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
                                        f"Currently support 2D explanation results for visualization. " \
                                        "Reduce higher dimensions to 2D for visualization."

    # normalize to [0, 1]; the small epsilon guards against constant maps
    # (e.g. an all-zero cam), which would otherwise divide by zero
    explanation = (explanation - explanation.min()) / (
        explanation.max() - explanation.min() + 1e-12)

    explanation = cv2.resize(explanation, resize_shape)
    explanation = np.uint8(255 * explanation)
    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)
    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)

    return explanation
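
# Rendering sketch (illustrative sizes): an 8x8 cam resized for a 640x480
# image becomes a 480x640x3 uint8 JET heatmap in RGB order.
#   >>> heat = resize_cam(np.random.rand(8, 8), (640, 480))
#   >>> heat.shape, heat.dtype
#   ((480, 640, 3), dtype('uint8'))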


class BBoxCAM:
    def __init__(self, FLAGS, cfg):
        self.FLAGS = FLAGS
        self.cfg = cfg
        # build model
        self.trainer = self.build_trainer(cfg)
        # num_class
        self.num_class = cfg.num_classes
        # set hook for extraction of featuremaps and grads
        self.set_hook(cfg)
        self.nms_idx_need_divid_numclass_arch = [
            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'
        ]
        """
        In these architectures the bbox tensor before nms contains a
        num_classes dimension, so the nms_keep_idx of a bbox must be
        divided by num_classes.
        """

        # cam image output_dir
        os.makedirs(FLAGS.cam_out, exist_ok=True)

    def build_trainer(self, cfg):
        # build trainer
        trainer = Trainer(cfg, mode='test')
        # load weights
        trainer.load_weights(cfg.weights)

        # have the model return extra_data (scores, nms_keep_idx) before nms
        trainer.model.use_extra_data = True
        # record the bbox indices kept by nms
        if cfg.architecture in ['FasterRCNN', 'MaskRCNN']:
            trainer.model.bbox_post_process.nms.return_index = True
        elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']:
            if trainer.model.post_process is not None:
                # anchor-based YOLOs: YOLOv3, PP-YOLO
                trainer.model.post_process.nms.return_index = True
            else:
                # anchor-free YOLOs: PP-YOLOE, PP-YOLOE+
                trainer.model.yolo_head.nms.return_index = True
        elif cfg.architecture in ['BlazeFace', 'SSD']:
            trainer.model.post_process.nms.return_index = True
        elif cfg.architecture == 'RetinaNet':
            trainer.model.head.nms.return_index = True
        else:
            logger.error(cfg.architecture +
                         ' is not supported for cam temporarily!')
            sys.exit()
        # Todo: Unify the head/post_process name in each model

        return trainer

    def set_hook(self, cfg):
        # set a hook to capture the target layer's featuremaps and grads
        self.target_feats = {}
        self.target_layer_name = cfg.target_feature_layer_name
        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor

        def hook(layer, input, output):
            self.target_feats[layer._layer_name_for_hook] = output

        try:
            # resolve the dotted layer path, e.g. 'model.backbone',
            # starting from the trainer
            target_layer = self.trainer
            for attr in self.target_layer_name.split('.'):
                target_layer = getattr(target_layer, attr)
            target_layer._layer_name_for_hook = self.target_layer_name
            target_layer.register_forward_post_hook(hook)
        except AttributeError:
            logger.error("The target_layer_name `" + self.target_layer_name +
                         "` is not in the model! Please check the spelling "
                         "and the network's architecture.")
            sys.exit()

    def get_bboxes(self):
        # get inference images
        images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img)

        # inference
        result = self.trainer.predict(
            images,
            draw_threshold=self.FLAGS.draw_threshold,
            output_dir=self.FLAGS.output_dir,
            save_results=self.FLAGS.save_results,
            visualize=False)[0]
        return result

    def get_bboxes_cams(self):
        # Get the bbox predictions (after nms) for the input
        inference_result = self.get_bboxes()

        # read input image
        # Todo: Support folder multi-images process
        img = np.array(Image.open(self.cfg.infer_img))

        # data for calculating bbox grad_cam
        extra_data = inference_result['extra_data']
        """
X
xs1997zju 已提交
214 215 216
        Example of Faster_RCNN based architecture:
            extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
                       'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1]
X
xs1997zju 已提交
217
                      }
X
xs1997zju 已提交
218 219 220
        Example of YOLOv3 based architecture:
            extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400]
                       'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1]
X
xs1997zju 已提交
221 222 223 224
                      }
        """

        # array index of the predicted bbox before nms
        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:
            # some architectures' bbox tensor before nms has shape
            # [num_of_bboxes_before_nms, num_classes, 4], so the kept index
            # must be divided by num_classes to recover the bbox index
            # (e.g. with 80 classes, flat index 163 maps to bbox 163 // 80 = 2);
            # currently this covers the rcnn architectures
            # (FasterRCNN, MaskRCNN, CascadeRCNN)
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(
            ) // self.num_class
        else:
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()

        # Calculate and visualize the heatmap of each predicted bbox
        for index, target_bbox in enumerate(inference_result['bbox']):
            # target_bbox: [cls, score, x1, y1, x2, y2]
            # filter bboxes with low predicted scores
            if target_bbox[1] < self.FLAGS.draw_threshold:
                continue

            target_bbox_before_nms = int(before_nms_indexes[index])

            if len(extra_data['scores'].shape) == 2:
                score_out = extra_data['scores'][target_bbox_before_nms]
            else:
                score_out = extra_data['scores'][0, :, target_bbox_before_nms]
            """
            There are two kinds of array shapes for the bbox score output:
                1) [num_of_bboxes_before_nms, num_classes], e.g. [1000, 80]
                2) [num_of_images, num_classes, num_of_yolo_bboxes_before_nms], e.g. [1, 80, 1000]
            """

            # construct one_hot label and do backward to get the gradients
            predicted_label = paddle.argmax(score_out)
            label_onehot = paddle.nn.functional.one_hot(
                predicted_label, num_classes=len(score_out))
            label_onehot = label_onehot.squeeze()
            target = paddle.sum(score_out * label_onehot)
            target.backward(retain_graph=True)


            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:  # backbone/neck level feature
                if isinstance(self.target_feats[self.target_layer_name], list):
                    # when the featuremap contains multiple scales,
                    # take the featuremap of the last scale
                    # Todo: fuse the cam result from multiscale featuremaps
                    if self.target_feats[self.target_layer_name][
                            -1].shape[-1] == 1:
                        """
                        if the last level featuremap is of size 1x1,
                        we take the second-to-last one
                        """
                        cam_grad = self.target_feats[self.target_layer_name][
                            -2].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -2].squeeze().cpu().numpy()
                    else:
                        cam_grad = self.target_feats[self.target_layer_name][
                            -1].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -1].squeeze().cpu().numpy()
                else:
                    cam_grad = self.target_feats[
                        self.target_layer_name].grad.squeeze().cpu().numpy()
                    cam_feat = self.target_feats[
                        self.target_layer_name].squeeze().cpu().numpy()
            else:  # roi level feature
                cam_grad = self.target_feats[
                    self.target_layer_name].grad.squeeze().cpu().numpy()[
                        target_bbox_before_nms]
                cam_feat = self.target_feats[
                    self.target_layer_name].squeeze().cpu().numpy()[
                        target_bbox_before_nms]

            # grad_cam:
            exp = grad_cam(cam_feat, cam_grad)

            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:
                """
                when using a backbone/neck featuremap,
                we first compute the cam over the whole image,
                then zero out the area outside the predicted bbox
                """
                # resize the cam image to the input image size
                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
                mask = np.zeros((img.shape[0], img.shape[1], 3))
                mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[2]):
                     int(target_bbox[4]), :] = 1
                resized_exp = resized_exp * mask
                # overlay the bbox cam on the input image
                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
            elif 'roi' in self.target_layer_name:
                # crop the bbox part of the image
                bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(target_bbox[5]),
                                         int(target_bbox[2]):int(target_bbox[4]), :])
                # resize the cam image to the bbox size
                resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0]))
                # overlay the bbox cam on the bbox image
                bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
                # paste the bbox cam image back into the original image
                overlay_vis = copy.deepcopy(img)
                overlay_vis[int(target_bbox[3]):int(target_bbox[5]),
                    int(target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis
            else:
                logger.error(
                    'cam is only supported for backbone/neck features and roi '
                    'features; the others are not supported temporarily!')
                sys.exit()

            # draw the predicted bbox on the overlay image
            cv2.rectangle(
                overlay_vis, (int(target_bbox[2]), int(target_bbox[3])),
                (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2)

            # save visualization result
            cam_image = Image.fromarray(overlay_vis)
            cam_image.save(os.path.join(self.FLAGS.cam_out, str(index) + '.jpg'))

            # clear gradients after each bbox grad_cam
            target.clear_gradient()
            for n, v in self.trainer.model.named_sublayers():
                v.clear_gradients()
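
# End-to-end usage sketch: in PaddleDetection this class is driven by
# tools/cam_ppdet.py, which parses the CLI into FLAGS and cfg (treat the exact
# entry-point name as an assumption about the repo layout):
#   bbox_cam = BBoxCAM(FLAGS, cfg)
#   bbox_cam.get_bboxes_cams()  # one heatmap jpg per kept bbox in FLAGS.cam_out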