Commit 2ae4ac30 authored by wangxinxin08

complete the backbone of pytorch yolov4/v5

modify ResizeAndKeepRatio
finish eval code

add module in __init__.py

fix bugs in code

modify name in model

modify code

finish debugging inference code
Parent 3d530cbf
architecture: YOLOv5
use_gpu: true
max_iters: 85000
log_smooth_window: 1
save_dir: output
snapshot_iter: 5000
metric: COCO
pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/ResNet50_vd_dcn_db_obj365_pretrained.tar
weights: output/yolov3_r50vd_dcn_db_iouaware_obj365_pretrained_coco/model_final
use_fine_grained_loss: false
num_classes: 80
YOLOv5:
backbone: CSPYolo
yolo_head: YOLOv5Head
use_fine_grained_loss: false
CSPYolo:
depth_multiple: 1.33
width_multiple: 1.25
act: 'mish'
yolov5: false
save: [22, 26, 30]
weight_prefix_name: 'model'
layers: [
[-1, 1, 'Conv', [32, 3, 1]], # 0
[-1, 1, 'Conv', [64, 3, 2]], # 1-P1/2
[-1, 1, 'Bottleneck', [64]],
[-1, 1, 'Conv', [128, 3, 2]], # 3-P2/4
[-1, 2, 'BottleneckCSP', [128]],
[-1, 1, 'Conv', [256, 3, 2]], # 5-P3/8
[-1, 8, 'BottleneckCSP', [256]],
[-1, 1, 'Conv', [512, 3, 2]], # 7-P4/16
[-1, 8, 'BottleneckCSP', [512]],
[-1, 1, 'Conv', [1024, 3, 2]], # 9-P5/32
[-1, 4, 'BottleneckCSP', [1024]], # 10
]
neck: [
[-1, 1, 'SPPCSP', [512]], # 11
[-1, 1, 'Conv', [256, 1, 1]],
[-1, 1, 'Upsample', ['None', 2, 'nearest']],
[8, 1, 'Conv', [256, 1, 1]], # route backbone P4
[[-1, -2], 1, 'Concat', [1]],
[-1, 2, 'BottleneckCSP2', [256]], # 16
[-1, 1, 'Conv', [128, 1, 1]],
[-1, 1, 'Upsample', ['None', 2, 'nearest']],
[6, 1, 'Conv', [128, 1, 1]], # route backbone P3
[[-1, -2], 1, 'Concat', [1]],
[-1, 2, 'BottleneckCSP2', [128]], # 21
[-1, 1, 'Conv', [256, 3, 1]],
[-2, 1, 'Conv', [256, 3, 2]],
[[-1, 16], 1, 'Concat', [1]], # cat
[-1, 2, 'BottleneckCSP2', [256]], # 25
[-1, 1, 'Conv', [512, 3, 1]],
[-2, 1, 'Conv', [512, 3, 2]],
[[-1, 11], 1, 'Concat', [1]], # cat
[-1, 2, 'BottleneckCSP2', [512]], # 29
[-1, 1, 'Conv', [1024, 3, 1]]
]
YOLOv5Head:
anchors: [[12, 16], [19, 36], [40, 28], [36, 75], [76, 55],
[72, 146], [142, 110], [192, 243], [459, 401]]
anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
yolo_loss: YOLOv3Loss
stride: [8, 16, 32]
start: 31
nms:
background_label: -1
keep_top_k: 300
nms_threshold: 0.65 #0.45
nms_top_k: -1
normalized: false
score_threshold: 0.001 #0.001
weight_prefix_name: 'model'
YOLOv3Loss:
batch_size: 4
ignore_thresh: 0.7
label_smooth: false
use_fine_grained_loss: false
LearningRate:
base_lr: 0.001
schedulers:
- !PiecewiseDecay
gamma: 0.1
milestones:
- 55000
- 75000
- !LinearWarmup
start_factor: 0.
steps: 4000
OptimizerBuilder:
optimizer:
momentum: 0.9
type: Momentum
regularizer:
factor: 0.0005
type: L2
_READER_: 'yolov5_reader.yml'
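For reference, the anchors, anchor_masks and stride entries in the head config above line up one group per detection level; a minimal plain-Python sketch of that grouping, mirroring what YOLOv5Head._parse_anchors does later in this commit:

# Group the anchors from the config above by anchor_masks, one group per stride.
anchors = [[12, 16], [19, 36], [40, 28], [36, 75], [76, 55],
           [72, 146], [142, 110], [192, 243], [459, 401]]
anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
strides = [8, 16, 32]

mask_anchors = [[anchors[i] for i in mask] for mask in anchor_masks]
for stride, group in zip(strides, mask_anchors):
    print('stride {:>2}: {}'.format(stride, group))
# stride  8: [[12, 16], [19, 36], [40, 28]]
# stride 16: [[36, 75], [76, 55], [72, 146]]
# stride 32: [[142, 110], [192, 243], [459, 401]]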
architecture: YOLOv5
use_gpu: true
max_iters: 85000
log_smooth_window: 1
save_dir: output
snapshot_iter: 5000
metric: COCO
pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/ResNet50_vd_dcn_db_obj365_pretrained.tar
weights: output/yolov3_r50vd_dcn_db_iouaware_obj365_pretrained_coco/model_final
use_fine_grained_loss: false
num_classes: 80
YOLOv5:
backbone: CSPYolo
yolo_head: YOLOv5Head
use_fine_grained_loss: false
CSPYolo:
depth_multiple: 1.33
width_multiple: 1.25
act: 'hard_swish'
weight_prefix_name: 'model'
YOLOv5Head:
anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
[59, 119], [116, 90], [156, 198], [373, 326]]
anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
yolo_loss: YOLOv3Loss
stride: [8, 16, 32]
nms:
background_label: -1
keep_top_k: 300
nms_threshold: 0.65 #0.45
nms_top_k: -1
normalized: false
score_threshold: 0.001 #0.001
weight_prefix_name: 'model'
YOLOv3Loss:
batch_size: 4
ignore_thresh: 0.7
label_smooth: false
use_fine_grained_loss: false
LearningRate:
base_lr: 0.001
schedulers:
- !PiecewiseDecay
gamma: 0.1
milestones:
- 55000
- 75000
- !LinearWarmup
start_factor: 0.
steps: 4000
OptimizerBuilder:
optimizer:
momentum: 0.9
type: Momentum
regularizer:
factor: 0.0005
type: L2
_READER_: 'yolov5_reader.yml'
TrainReader:
inputs_def:
fields: ['image', 'gt_bbox', 'gt_class', 'gt_score']
num_max_boxes: 50
use_fine_grained_loss: true
dataset:
!COCODataSet
image_dir: train2017
anno_path: annotations/instances_train2017.json
dataset_dir: dataset/coco
with_background: false
sample_transforms:
- !DecodeImage
to_rgb: True
# with_mosaic: True
# - !MosaicImage
# offset: 0.3
# mosaic_scale: [0.8, 1.0]
# sample_scale: [0.8, 1.0]
# sample_flip: 0.5
# use_cv2: true
# interp: 2
- !NormalizeBox {}
- !PadBox
num_max_boxes: 50
- !BboxXYXY2XYWH {}
batch_transforms:
- !RandomShape
sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640]
random_inter: True
- !NormalizeImage
mean: [0.0, 0.0, 0.0]
std: [1.0, 1.0, 1.0]
is_scale: True
is_channel_first: false
- !Permute
to_bgr: false
channel_first: True
# focus: false
- !Gt2YoloTarget
anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
[59, 119], [116, 90], [156, 198], [373, 326]]
downsample_ratios: [8, 16, 32]
batch_size: 2
mosaic_prob: 0.3
mosaic_epoch: 8
shuffle: true
drop_last: true
worker_num: 8
bufsize: 16
use_process: true
EvalReader:
inputs_def:
fields: ['image', 'im_size', 'im_id', 'im_pad', 'im_scale']
num_max_boxes: 50
dataset:
!COCODataSet
dataset_dir: dataset/coco
anno_path: annotations/instances_val2017.json
#anno_path: annotations/instances_val2017_debug_139.json
image_dir: val2017
with_background: false
sample_transforms:
- !DecodeImage
to_rgb: true
with_mixup: false
- !ResizeAndKeepRatio
target_size: 640
augment: false
- !LetterBox
target_size: 640
rect: true
auto: false
augment: false
- !NormalizeImage
mean: [0.0, 0.0, 0.0]
std: [1.0, 1.0, 1.0]
is_scale: true
is_channel_first: false
- !Permute
to_bgr: false
channel_first: true
- !PadBox
num_max_boxes: 50
batch_size: 1
drop_empty: false
worker_num: 8
bufsize: 16
target_size: 640
rect: true
pad: 0.5
stride: 32
TestReader:
inputs_def:
fields: ['image', 'im_size', 'im_id', 'im_pad', 'im_scale']
dataset:
!ImageFolder
anno_path: annotations/instances_val2017.json
with_background: false
sample_transforms:
- !DecodeImage
to_rgb: True
with_mixup: false
- !ResizeAndKeepRatio
target_size: 640
augment: false
- !LetterBox
target_size: 672
rect: false
auto: false
augment: false
- !NormalizeImage
mean: [0.0, 0.0, 0.0]
std: [1.0, 1.0, 1.0]
is_scale: True
is_channel_first: false
- !Permute
to_bgr: false
channel_first: True
batch_size: 1
@@ -202,12 +202,44 @@ class Reader(object):
use_fine_grained_loss=False,
num_classes=80,
bufsize=-1,
target_size=640,
rect=False,
pad=0.5,
stride=32,
memsize='3G',
inputs_def=None,
devices_num=1,
num_trainers=1):
self._dataset = dataset
self._roidbs = self._dataset.get_roidb()
if rect:
n = len(self._roidbs)
bi = np.floor(np.arange(n) / batch_size).astype(int)
nb = bi[-1] + 1
s = []
for i, rec in enumerate(self._roidbs):
s.append([rec['h'], rec['w']])
s = np.array(s)
ar = s[:, 0] / s[:, 1] # h / w
irect = ar.argsort()
ar = ar[irect]
shapes = [[1, 1]] * nb
for i in range(nb):
ari = ar[bi == i]
mini, maxi = ari.min(), ari.max()
if maxi < 1:
shapes[i] = [maxi, 1]
elif mini > 1:
shapes[i] = [1, 1 / mini]
batch_shapes = np.ceil(np.array(shapes) * target_size / stride + pad) * stride
new_roidbs = [self._roidbs[j] for j in irect]
self._roidbs = new_roidbs
for i, j in enumerate(bi):
self._roidbs[i].update({'new_shape': batch_shapes[j]})
self._fields = copy.deepcopy(inputs_def[
'fields']) if inputs_def else None
......
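To make the rect-batching arithmetic above concrete, here is a self-contained NumPy sketch (the image sizes are invented) that reproduces the batch-shape computation for batch_size=2, target_size=640, stride=32, pad=0.5:

import numpy as np

# Illustrative only: reproduces the rect-batching shapes from Reader.__init__.
sizes = np.array([[480, 640], [720, 1280], [640, 640], [1080, 1920]])  # [h, w]
batch_size, target_size, stride, pad = 2, 640, 32, 0.5

ar = sizes[:, 0] / sizes[:, 1]                       # aspect ratios h / w
order = ar.argsort()                                 # sort images by aspect ratio
ar = ar[order]
bi = np.floor(np.arange(len(sizes)) / batch_size).astype(int)
nb = bi[-1] + 1

shapes = [[1, 1]] * nb
for i in range(nb):
    ari = ar[bi == i]
    mini, maxi = ari.min(), ari.max()
    if maxi < 1:                                     # all wide images: shrink height
        shapes[i] = [maxi, 1]
    elif mini > 1:                                   # all tall images: shrink width
        shapes[i] = [1, 1 / mini]

batch_shapes = np.ceil(np.array(shapes) * target_size / stride + pad) * stride
print(batch_shapes)                                  # [[384. 672.] [672. 672.]]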
@@ -372,6 +372,71 @@ class ResizeImage(BaseOperator):
sample['image'] = im
return sample
@register_op
class ResizeAndKeepRatio(BaseOperator):
def __init__(self, target_size, augment=False):
super(ResizeAndKeepRatio, self).__init__()
self.target_size = target_size
self.augment = augment
def __call__(self, sample, context=None):
im = sample['image']
h0, w0 = im.shape[:2]
r = self.target_size / max(h0, w0)
if r != 1:
interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR
im = cv2.resize(im, (int(w0 * r), int(h0 * r)), interpolation=interp)
sample['image'] = im
sample['im_size'] = [float(h0), float(w0)]
sample['im_scale'] = [1. / r, 1. / r]
return sample
@register_op
class LetterBox(BaseOperator):
def __init__(self, target_size, rect=True, color=(114, 114, 114), auto=True, scaleFill=False, augment=True):
super(LetterBox, self).__init__()
if isinstance(target_size, int):
target_size = (target_size, target_size)
self.target_size = target_size
self.color = color
self.auto = auto
self.scaleFill = scaleFill
self.augment = augment
self.rect = rect
def __call__(self, sample, context=None):
im = sample['image']
shape = im.shape[:2]
new_shape = sample['new_shape'] if self.rect else self.target_size
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not self.augment:
r = min(r, 1.0)
ratio = r, r
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
if self.auto: # minimum rectangle
dw, dh = np.mod(dw, 64), np.mod(dh, 64) # wh padding
elif self.scaleFill: # stretch
dw, dh = 0.0, 0.0
new_unpad = (new_shape[1], new_shape[0])
ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=self.color) # add border
sample['image'] = im
sample['im_pad'] = [dh, dw]
return sample
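As a sanity check of the two eval-time operators above, an arithmetic-only walkthrough (no cv2; the 720x1280 input and the 384x640 batch shape are example values) of what ResizeAndKeepRatio and LetterBox record in im_scale and im_pad:

# Arithmetic-only walkthrough, example input 720x1280, target_size=640.
h0, w0, target_size = 720, 1280, 640

# ResizeAndKeepRatio: scale the long side to target_size.
r = target_size / max(h0, w0)                    # 0.5
h1, w1 = int(h0 * r), int(w0 * r)                # 360 x 640
im_scale = [1. / r, 1. / r]                      # [2.0, 2.0], maps boxes back to the input

# LetterBox (rect=True): pad the resized image up to the per-batch shape,
# e.g. new_shape = [384, 640] as produced by the rect batching in the reader.
new_shape = (384, 640)                           # [h, w], assumed batch shape
rr = min(new_shape[0] / h1, new_shape[1] / w1)   # 1.0667 -> clamped below
rr = min(rr, 1.0)                                # never upscale at eval time
new_unpad = (int(round(w1 * rr)), int(round(h1 * rr)))   # (640, 360)
dw = (new_shape[1] - new_unpad[0]) / 2           # 0.0
dh = (new_shape[0] - new_unpad[1]) / 2           # 12.0
im_pad = [dh, dw]                                # border added on each side

print(im_scale, im_pad)                          # [2.0, 2.0] [12.0, 0.0]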
@register_op
class RandomFlipImage(BaseOperator):
......
@@ -21,6 +21,7 @@ from . import fcos_head
from . import corner_head
from . import efficient_head
from . import ttf_head
from . import yolov5_head
from .rpn_head import *
from .yolo_head import *
@@ -29,3 +30,4 @@ from .fcos_head import *
from .corner_head import *
from .efficient_head import *
from .ttf_head import *
from .yolov5_head import *
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from ppdet.modeling.ops import MultiClassNMS, MultiClassSoftNMS, MatrixNMS
from ppdet.modeling.losses.yolo_loss import YOLOv3Loss
from ppdet.core.workspace import register
from ppdet.modeling.ops import DropBlock
from .iou_aware import get_iou_aware_score
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from ppdet.utils.check import check_version
__all__ = ['YOLOv5Head']
@register
class YOLOv5Head(object):
__inject__ = ['nms', 'yolo_loss']
__shared__ = ['num_classes', 'weight_prefix_name']
def __init__(self,
anchors=[[12, 16], [19, 36], [40, 28], [36, 75], [76, 55],
[72, 146], [142, 110], [192, 243], [459, 401]],
anchor_masks=[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
num_classes=80,
yolo_loss="YOLOv3Loss",
weight_prefix_name='',
stride=[8, 16, 32],
start=24,
nms=MultiClassNMS(score_threshold=0.01,
nms_top_k=1000,
keep_top_k=100,
nms_threshold=0.45,
background_label=-1).__dict__):
self.anchors = anchors
self.mask_anchors = self._parse_anchors(anchors, anchor_masks)
self.anchor_masks = anchor_masks
self.num_classes = num_classes
self.yolo_loss = yolo_loss
self.prefix = weight_prefix_name
self.stride = stride
self.start = start
self.nms = MultiClassNMS(**nms) if isinstance(nms, dict) else nms
def _create_tensor_from_numpy(self, numpy_array):
paddle_array = fluid.layers.create_global_var(shape=numpy_array.shape,
value=0.,
dtype=numpy_array.dtype)
fluid.layers.assign(numpy_array, paddle_array)
return paddle_array
def _parse_anchors(self, anchors, anchor_masks):
output = []
for anchor_mask in anchor_masks:
output.append(
[anchors[i] for i in anchor_mask]
)
return output
def _get_outputs(self, inputs):
outputs = []
for i, x in enumerate(inputs):
c_out = len(self.anchor_masks[i]) * (self.num_classes + 5)
output = fluid.layers.conv2d(
x,
c_out,
1,
1,
0,
act=None,
param_attr=ParamAttr(name=self.prefix +
'.{}.m.{}.weight'.format(self.start, i)),
bias_attr=ParamAttr(name=self.prefix +
'.{}.m.{}.bias'.format(self.start, i)))
outputs.append(output)
return outputs
def get_loss(self, inputs, gt_box, gt_label, gt_score, targets):
outputs = self._get_outputs(inputs)
return self.yolo_loss(outputs, gt_box, gt_label, gt_score, targets,
self.anchors, self.anchor_masks,
self.mask_anchors, self.num_classes,
self.prefix)
def get_prediction(self, inputs, im_size, im_scale, im_pad, exclude_nms=False):
outputs = self._get_outputs(inputs)
boxes, scores = [], []
for i, output in enumerate(outputs):
output = fluid.layers.sigmoid(output)
output_shape = fluid.layers.shape(output)
bs, c, h, w = output_shape[0], output_shape[1], output_shape[2], output_shape[3]
na = len(self.anchor_masks[i])
no = self.num_classes + 5
output = fluid.layers.reshape(output, [bs, na, no, h, w])
output = fluid.layers.transpose(output, perm=[0, 1, 3, 4, 2])
grid = self._make_grid(w, h)
# decode
xy = (output[:, :, :, :, 0:2] * 2 - 0.5 + grid) * self.stride[i]
anchor = np.array(self.mask_anchors[i]).reshape((1, na, 1, 1, 2)).astype(np.float32)
anchor = self._create_tensor_from_numpy(anchor)
wh = (output[:, :, :, :, 2:4] * 2) ** 2 * anchor
box = self._xywh2xxyy(xy, wh)
box = fluid.layers.reshape(box, (bs, -1, 4))
box = self._scale_box(box, im_scale, im_pad)
box = self._clip_box(box, im_size)
boxes.append(box)
# calculate prop
objectness = output[:, :, :, :, 4:5]
cls_p = output[:, :, :, :, 5:] * objectness
score = fluid.layers.reshape(cls_p, (bs, -1, self.num_classes))
scores.append(score)
yolo_boxes = fluid.layers.concat(boxes, axis=1)
yolo_scores = fluid.layers.concat(scores, axis=1)
if exclude_nms:
return {'bbox': yolo_scores}
if type(self.nms) is not MultiClassSoftNMS:
yolo_scores = fluid.layers.transpose(yolo_scores, perm=[0, 2, 1])
pred = self.nms(bboxes=yolo_boxes, scores=yolo_scores)
return {'bbox': pred}
def _make_grid(self, nx, ny):
start = self._create_tensor_from_numpy(np.array([0], dtype=np.int32))
step = self._create_tensor_from_numpy(np.array([1], dtype=np.int32))
yv, xv = fluid.layers.meshgrid([fluid.layers.arange(start, ny, step), fluid.layers.arange(start, nx, step)])
grid = fluid.layers.stack([xv, yv], axis=2)
return fluid.layers.reshape(grid, (1, 1, ny, nx, 2))
def _xywh2xxyy(self, xy, wh):
x1y1 = xy - wh / 2
x2y2 = xy + wh / 2
return fluid.layers.concat([x1y1, x2y2], axis=-1)
def _scale_box(self, box, im_scale, im_pad):
x1 = (box[:, :, 0:1] - im_pad[:, 1:2]) * im_scale[:, 1:2]
y1 = (box[:, :, 1:2] - im_pad[:, 0:1]) * im_scale[:, 0:1]
x2 = (box[:, :, 2:3] - im_pad[:, 1:2]) * im_scale[:, 1:2] - 1
y2 = (box[:, :, 3:4] - im_pad[:, 0:1]) * im_scale[:, 0:1] - 1
return fluid.layers.concat([x1, y1, x2, y2], axis=-1)
def _clip_box(self, box, im_size):
bs = fluid.layers.shape(box)[0]
outputs = []
# NOTE: only the first image is clipped; the eval and test readers use batch_size 1
for i in range(1):
s = fluid.layers.cast(im_size[i], dtype=np.float32)
x1 = fluid.layers.clamp(box[i, :, 0:1], min=0., max=s[1])
y1 = fluid.layers.clamp(box[i, :, 1:2], min=0., max=s[0])
x2 = fluid.layers.clamp(box[i, :, 2:3], min=0., max=s[1])
y2 = fluid.layers.clamp(box[i, :, 3:4], min=0., max=s[0])
output = fluid.layers.concat([x1, y1, x2, y2], axis=-1)
outputs.append(output)
return fluid.layers.stack(outputs, axis=0)
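The decode in get_prediction follows the usual YOLOv5 formulation, xy = (sigmoid * 2 - 0.5 + grid) * stride and wh = (sigmoid * 2)^2 * anchor; a NumPy-only sketch of the same transform for a single toy output level (shapes and values are made up):

import numpy as np

# Toy decode for one level, mirroring YOLOv5Head.get_prediction above.
na, ny, nx, stride = 3, 2, 2, 32
anchors = np.array([[142, 110], [192, 243], [459, 401]], dtype=np.float32)

raw = np.random.randn(1, na, ny, nx, 4).astype(np.float32)   # tx, ty, tw, th
p = 1.0 / (1.0 + np.exp(-raw))                               # sigmoid

yv, xv = np.meshgrid(np.arange(ny), np.arange(nx), indexing='ij')
grid = np.stack([xv, yv], axis=2).reshape(1, 1, ny, nx, 2)

xy = (p[..., 0:2] * 2.0 - 0.5 + grid) * stride               # box centers in pixels
wh = (p[..., 2:4] * 2.0) ** 2 * anchors.reshape(1, na, 1, 1, 2)

boxes = np.concatenate([xy - wh / 2, xy + wh / 2], axis=-1)  # x1, y1, x2, y2
print(boxes.shape)                                           # (1, 3, 2, 2, 4)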
@@ -36,6 +36,7 @@ from .cascade_rcnn import *
from .cascade_mask_rcnn import *
from .cascade_rcnn_cls_aware import *
from .yolo import *
from .yolov5 import *
from .ssd import *
from .retinanet import *
from .efficientdet import *
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
from paddle import fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register
__all__ = ['YOLOv5']
@register
class YOLOv5(object):
"""
YOLOv5 network
Args:
backbone (object): an backbone instance
yolov5_head (object): an `YOLOv5Head` instance
"""
__category__ = 'architecture'
__inject__ = ['backbone', 'yolo_head']
__shared__ = ['use_fine_grained_loss']
def __init__(self,
backbone='CSPYolo',
yolo_head='YOLOv5Head',
use_fine_grained_loss=False):
super(YOLOv5, self).__init__()
self.backbone = backbone
self.yolo_head = yolo_head
self.use_fine_grained_loss = use_fine_grained_loss
def build(self, feed_vars, mode='train', exclude_nms=False):
im = feed_vars['image']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
body_feats = self.backbone(im)
if isinstance(body_feats, OrderedDict):
body_feat_names = list(body_feats.keys())
body_feats = [body_feats[name] for name in body_feat_names]
# cast features back to FP32
if mixed_precision_enabled:
body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats]
if mode == 'train':
gt_bbox = feed_vars['gt_bbox']
gt_class = feed_vars['gt_class']
gt_score = feed_vars['gt_score']
# Get targets for splited yolo loss calculation
# YOLOv3 supports up to 3 output layers currently
targets = []
for i in range(3):
k = 'target{}'.format(i)
if k in feed_vars:
targets.append(feed_vars[k])
loss = self.yolo_head.get_loss(body_feats, gt_bbox, gt_class,
gt_score, targets)
total_loss = fluid.layers.sum(list(loss.values()))
loss.update({'loss': total_loss})
return loss
else:
im_size = feed_vars['im_size']
im_pad = feed_vars['im_pad']
im_scale = feed_vars['im_scale']
return self.yolo_head.get_prediction(body_feats, im_size, im_scale, im_pad, exclude_nms=exclude_nms)
def _inputs_def(self, image_shape, num_max_boxes):
im_shape = [None] + image_shape
# yapf: disable
inputs_def = {
'image': {'shape': im_shape, 'dtype': 'float32', 'lod_level': 0},
'im_size': {'shape': [None, 2], 'dtype': 'int32', 'lod_level': 0},
'im_scale': {'shape': [None, 2], 'dtype': 'float32', 'lod_level': 0},
'im_pad': {'shape': [None, 2], 'dtype': 'float32', 'lod_level': 0},  # LetterBox can record half-pixel padding
'im_id': {'shape': [None, 1], 'dtype': 'int64', 'lod_level': 0},
'gt_bbox': {'shape': [None, num_max_boxes, 4], 'dtype': 'float32', 'lod_level': 0},
'gt_class': {'shape': [None, num_max_boxes], 'dtype': 'int32', 'lod_level': 0},
'gt_score': {'shape': [None, num_max_boxes], 'dtype': 'float32', 'lod_level': 0},
'is_difficult': {'shape': [None, num_max_boxes],'dtype': 'int32', 'lod_level': 0},
}
# yapf: enable
if self.use_fine_grained_loss:
# yapf: disable
targets_def = {
'target0': {'shape': [None, 3, 86, 19, 19], 'dtype': 'float32', 'lod_level': 0},
'target1': {'shape': [None, 3, 86, 38, 38], 'dtype': 'float32', 'lod_level': 0},
'target2': {'shape': [None, 3, 86, 76, 76], 'dtype': 'float32', 'lod_level': 0},
}
# yapf: enable
downsample = 32
for k, mask in zip(targets_def.keys(), self.yolo_head.anchor_masks):
targets_def[k]['shape'][1] = len(mask)
targets_def[k]['shape'][2] = 6 + self.yolo_head.num_classes
targets_def[k]['shape'][3] = image_shape[
-2] // downsample if image_shape[-2] else None
targets_def[k]['shape'][4] = image_shape[
-1] // downsample if image_shape[-1] else None
downsample //= 2
inputs_def.update(targets_def)
return inputs_def
def build_inputs(
self,
image_shape=[3, None, None],
fields=['image', 'gt_bbox', 'gt_class', 'gt_score'], # for train
num_max_boxes=50,
use_dataloader=True,
iterable=False):
inputs_def = self._inputs_def(image_shape, num_max_boxes)
if 'im_size' not in fields and self.use_fine_grained_loss:
fields.extend(['target0', 'target1', 'target2'])
feed_vars = OrderedDict([(key, fluid.data(
name=key,
shape=inputs_def[key]['shape'],
dtype=inputs_def[key]['dtype'],
lod_level=inputs_def[key]['lod_level'])) for key in fields])
loader = fluid.io.DataLoader.from_generator(
feed_list=list(feed_vars.values()),
capacity=16,
use_double_buffer=True,
iterable=iterable) if use_dataloader else None
return feed_vars, loader
def train(self, feed_vars):
return self.build(feed_vars, mode='train')
def eval(self, feed_vars):
return self.build(feed_vars, mode='test')
def test(self, feed_vars, exclude_nms=False):
return self.build(feed_vars, mode='test', exclude_nms=exclude_nms)
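A minimal usage sketch of the architecture class, following the usual PaddleDetection load_config/create flow (the config path below is illustrative, not part of this commit):

from paddle import fluid
from ppdet.core.workspace import load_config, create

cfg = load_config('configs/yolov4/yolov4_csp.yml')   # illustrative path
model = create(cfg.architecture)                     # -> YOLOv5 instance

startup_prog = fluid.Program()
eval_prog = fluid.Program()
with fluid.program_guard(eval_prog, startup_prog):
    with fluid.unique_name.guard():
        inputs_def = cfg['EvalReader']['inputs_def']
        feed_vars, loader = model.build_inputs(**inputs_def)
        fetches = model.eval(feed_vars)              # {'bbox': ...} after NMS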
@@ -35,6 +35,7 @@ from . import bifpn
from . import cspdarknet
from . import acfpn
from . import ghostnet
from . import cspyolo
from .resnet import *
from .resnext import *
@@ -57,3 +58,4 @@ from .bifpn import *
from .cspdarknet import *
from .acfpn import *
from .ghostnet import *
from .cspyolo import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import six
import numpy as np
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from ppdet.core.workspace import register
__all__ = ['CSPYolo']
def autopad(k, p):
if p is None:
p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
return p
def make_divisible(x, divisor):
# Return the smallest multiple of divisor that is >= x
return math.ceil(x / divisor) * divisor
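Quick illustrative checks of the two helpers above (the multipliers come from the configs earlier in this commit and the class defaults below):

assert make_divisible(256 * 1.25, 8) == 320   # width_multiple = 1.25 (yolov4-csp config)
assert make_divisible(64 * 0.50, 8) == 32     # width_multiple = 0.50 (class default)
assert autopad(3, None) == 1                  # 3x3 conv gets 'same'-style padding
assert autopad(1, None) == 0                  # 1x1 conv needs none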
@register
class CSPYolo(object):
__shared__ = ['depth_multiple', 'width_multiple']
def __init__(self,
layers=None,
neck=None,
depth_multiple=0.33,
width_multiple=0.50,
act='none',
yolov5=True,
save=[17, 20, 23],
conv_decay=0.0,
norm_type='bn',
norm_decay=0.0,
weight_prefix_name=''):
if layers is None:
self.layers = [
# [from, number, module, args, kwargs]
[-1, 1, 'Focus', [64, 3]], # 0-P1/2
[-1, 1, 'Conv', [128, 3, 2]], # 1-P2/4
[-1, 3, 'BottleneckCSP', [128]],
[-1, 1, 'Conv', [256, 3, 2]], # 3-P3/8
[-1, 9, 'BottleneckCSP', [256]],
[-1, 1, 'Conv', [512, 3, 2]], # 5-P4/16
[-1, 9, 'BottleneckCSP', [512]],
[-1, 1, 'Conv', [1024, 3, 2]], # 7-P5/32
[-1, 1, 'SPP', [1024, [5, 9, 13]]],
[-1, 3, 'BottleneckCSP', [1024, False]], # 9
]
else:
self.layers = layers
if neck is None:
self.neck = [
[-1, 1, 'Conv', [512, 1, 1]],
[-1, 1, 'Upsample', [None, 2, 'nearest']],
[[-1, 6], 1, 'Concat', [1]], # cat backbone P4
[-1, 3, 'BottleneckCSP', [512, False]], # 13
[-1, 1, 'Conv', [256, 1, 1]],
[-1, 1, 'Upsample', [None, 2, 'nearest']],
[[-1, 4], 1, 'Concat', [1]], # cat backbone P3
[-1, 3, 'BottleneckCSP', [256, False]], # 17 (P3/8-small)
[-1, 1, 'Conv', [256, 3, 2]],
[[-1, 14], 1, 'Concat', [1]], # cat head P4
[-1, 3, 'BottleneckCSP', [512, False]], # 20 (P4/16-medium)
[-1, 1, 'Conv', [512, 3, 2]],
[[-1, 10], 1, 'Concat', [1]], # cat head P5
[-1, 3, 'BottleneckCSP', [1024, False]], # 23 (P5/32-large)
]
else:
self.neck = neck
self.depth_multiple = depth_multiple
self.width_multiple = width_multiple
self.act = act
self.yolov5 = yolov5
self.save = save
self.conv_decay = conv_decay
self.norm_type = norm_type
self.norm_decay = norm_decay
self.weight_prefix_name = weight_prefix_name
self.layer_cfg = {
'Conv': self._conv,
'Focus': self._focus,
'Bottleneck': self._bottleneck,
'BottleneckCSP': self._bottleneckcsp,
'BottleneckCSP2': self._bottleneckcsp2,
'SPP': self._spp,
'SPPCSP': self._sppcsp,
'Upsample': self._upsample,
'Concat': self._concat
}
self.act_cfg = {
'relu': fluid.layers.relu,
'leaky_relu': lambda x: fluid.layers.leaky_relu(x, alpha=0.1),
'hard_swish': self._hard_swish,
'mish': self._mish,
'none': self._identity
}
def _identity(self, x):
return x
def _hard_swish(self, x):
return x * fluid.layers.relu6(x + 3) / 6.
def _softplus(self, x):
expf = fluid.layers.exp(fluid.layers.clip(x, -200, 50))
return fluid.layers.log(1 + expf)
def _mish(self, x):
return x * fluid.layers.tanh(self._softplus(x))
def _conv(self, x, c_out, k=1, s=1, p=None, g=1, act='none', name=None):
x = fluid.layers.conv2d(x,
c_out,
k,
stride=s,
padding=autopad(k, p),
groups=g,
param_attr=ParamAttr(
regularizer=L2Decay(self.conv_decay),
name=name + '.conv.weight'),
bias_attr=False)
x = self._bn(x, name=name)
x = self.act_cfg[act](x)
return x
def _bn(self, x, name=None):
param_attr = ParamAttr(regularizer=L2Decay(self.norm_decay),
name=name + '.{}.weight'.format(self.norm_type))
bias_attr = ParamAttr(regularizer=L2Decay(self.norm_decay),
name=name + '.{}.bias'.format(self.norm_type))
x = fluid.layers.batch_norm(
input=x,
epsilon=0.001,
param_attr=param_attr,
bias_attr=bias_attr,
moving_mean_name=name + '.{}.running_mean'.format(self.norm_type),
moving_variance_name=name + '.{}.running_var'.format(self.norm_type))
return x
def _focus(self, x, c_out, k=1, s=1, p=None, g=1, act='none', name=None):
x = fluid.layers.concat([
x[:, :, 0::2, 0::2], x[:, :, 1::2, 0::2], x[:, :, 0::2, 1::2],
x[:, :, 1::2, 1::2]
],
axis=1)
x = self._conv(x, c_out, k, s, p, g, act, name + '.conv')
return x
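The Focus slicing above is a space-to-depth rearrangement; a NumPy check on a toy tensor (sketch only) that it turns (N, C, H, W) into (N, 4C, H/2, W/2) before the convolution mixes the channels:

import numpy as np

x = np.arange(1 * 3 * 4 * 4, dtype=np.float32).reshape(1, 3, 4, 4)
y = np.concatenate([x[:, :, 0::2, 0::2], x[:, :, 1::2, 0::2],
                    x[:, :, 0::2, 1::2], x[:, :, 1::2, 1::2]], axis=1)
print(y.shape)   # (1, 12, 2, 2)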
def _bottleneck(self,
x,
c_out,
shortcut=True,
g=1,
e=0.5,
act='none',
name=None):
c_h = int(c_out * e)
y = self._conv(x, c_h, 1, 1, act=act, name=name + '.cv1')
y = self._conv(y, c_out, 3, 1, g=g, act=act, name=name + '.cv2')
if shortcut:
y = fluid.layers.elementwise_add(x=x, y=y, act=None)
return y
def _bottleneckcsp(self,
x,
c_out,
n=1,
shortcut=True,
g=1,
e=0.5,
act='none',
name=None):
c_h = int(c_out * e)
# left branch
y1 = self._conv(x, c_h, 1, 1, act=act, name=name + '.cv1')
# n bottle neck
bottleneck = self._bottleneck
for i in six.moves.xrange(n):
y1 = bottleneck(y1, c_h, shortcut, g, 1.0, act,
name + '.m.{}'.format(i))
y1 = fluid.layers.conv2d(
y1,
c_h,
1,
1,
param_attr=ParamAttr(regularizer=L2Decay(self.conv_decay),
name=name +
'.cv3.weight'),
bias_attr=False)
# right branch
y2 = fluid.layers.conv2d(x,
c_h,
1,
1,
param_attr=ParamAttr(
regularizer=L2Decay(self.conv_decay),
name=name + '.cv2.weight'),
bias_attr=False)
# concat
y = fluid.layers.concat([y1, y2], axis=1)
# bn + act
y = self._bn(y, name=name)
y = self.act_cfg['leaky_relu'](y) if self.yolov5 else self.act_cfg[act](
y)
# conv
y = self._conv(y, c_out, 1, 1, act=act, name=name + '.cv4')
return y
def _bottleneckcsp2(self,
x,
c_out,
n=1,
shortcut=False,
g=1,
e=1.0,
act='none',
name=None):
c_h = int(c_out)
x = self._conv(x, c_h, 1, 1, act=act, name=name + '.cv1')
# left_branch
y1 = x
for i in range(n):
y1 = self._bottleneck(y1, c_h, shortcut, g, 1.0, act,
name + '.m.{}'.format(i))
# right_branch
y2 = fluid.layers.conv2d(x,
c_h,
1,
1,
param_attr=ParamAttr(
regularizer=L2Decay(self.conv_decay),
name=name + '.cv2.weight'),
bias_attr=False)
# concat
y = fluid.layers.concat([y1, y2], axis=1)
# bn + act
y = self._bn(y, name=name)
y = self.act_cfg[act](y)
# conv
y = self._conv(y, c_out, 1, 1, act=act, name=name + '.cv3')
return y
def _spp(self, x, c_out, k=(5, 9, 13), act='none', name=None):
c_in = int(x.shape[1])
c_h = c_in // 2
# conv1
x = self._conv(x, c_h, 1, 1, act=act, name=name + '.cv1')
ys = [x]
# pooling
for s in k:
ys.append(fluid.layers.pool2d(x, s, 'max', 1, s // 2))
y = fluid.layers.concat(ys, axis=1)
# conv2
y = self._conv(y, c_out, 1, 1, act=act, name=name + '.cv2')
return y
def _sppcsp(self,
x,
c_out,
k=(5, 9, 13),
e=0.5,
act='none',
name=None):
c_h = int(2 * c_out * e)
# left branch
y1 = self._conv(x, c_h, 1, 1, act=act, name=name + '.cv1')
y1 = self._conv(y1, c_h, 3, 1, act=act, name=name + '.cv3')
y1 = self._conv(y1, c_h, 1, 1, act=act, name=name + '.cv4')
ys = [y1]
# pooling
for s in k:
ys.append(fluid.layers.pool2d(y1, s, 'max', 1, s // 2))
y1 = fluid.layers.concat(ys, axis=1)
y1 = self._conv(y1, c_h, 1, 1, act=act, name=name + '.cv5')
y1 = self._conv(y1, c_h, 3, 1, act=act, name=name + '.cv6')
# right_branch
y2 = fluid.layers.conv2d(x,
c_h,
1,
1,
param_attr=ParamAttr(
regularizer=L2Decay(self.conv_decay),
name=name + '.cv2.weight'),
bias_attr=False)
# concat
y = fluid.layers.concat([y1, y2], axis=1)
y = self._bn(y, name=name)
y = self.act_cfg[act](y)
y = self._conv(y, c_out, 1, 1, act=act, name=name + '.cv7')
return y
def _upsample(self, x, out_shape, scale, method, name=None):
out_shape = None if out_shape == 'None' else out_shape
if method == 'bilinear':
return fluid.layers.resize_bilinear(x, out_shape, scale, name=name)
if method == 'trilinear':
return fluid.layers.resize_trilinear(x, out_shape, scale, name=name)
return fluid.layers.resize_nearest(x, out_shape, scale, name=name)
def _concat(self, x, axis, name=None):
y = fluid.layers.concat(x, axis, name=name)
return y
def Print(self, x):
fluid.layers.Print(fluid.layers.reduce_max(x))
fluid.layers.Print(fluid.layers.reduce_min(x))
fluid.layers.Print(fluid.layers.reduce_mean(x))
fluid.layers.Print(fluid.layers.reduce_mean(fluid.layers.abs(x)))
def __call__(self, x):
prefix = self.weight_prefix_name
gw, gd = self.width_multiple, self.depth_multiple
layers, outputs = [], []
for i, (f, n, m, args) in enumerate(self.layers + self.neck):
if i == 0:
inputs = x
else:
if isinstance(f, int):
inputs = layers[f]
else:
inputs = [layers[idx] for idx in f]
n = max(round(n * gd), 1) if n > 1 else n
if m in [
'Conv', 'Bottleneck', 'BottleneckCSP', 'BottleneckCSP2',
'SPP', 'SPPCSP', 'Focus'
]:
c_out = args[0]
args[0] = make_divisible(c_out * gw, 8)
if m in ['BottleneckCSP', 'BottleneckCSP2']:
args.insert(1, n)
if m in ['Upsample', 'Concat']:
layers.append(self.layer_cfg[m](inputs,
*args,
name=prefix + '.{}'.format(i)))
else:
layers.append(self.layer_cfg[m](inputs,
*args,
act=self.act,
name=prefix + '.{}'.format(i)))
if i in self.save:
outputs.append(layers[i])
return outputs
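For concreteness, one spec row from the yolov4-csp config rescaled the same way __call__ does (a sketch relying on the make_divisible helper defined at the top of this file):

# Row [-1, 8, 'BottleneckCSP', [256]] with depth_multiple 1.33, width_multiple 1.25.
gd, gw = 1.33, 1.25
f, n, m, args = -1, 8, 'BottleneckCSP', [256]
n = max(round(n * gd), 1) if n > 1 else n      # 8 -> 11 bottleneck repeats
c_out = make_divisible(args[0] * gw, 8)        # 256 -> 320 output channels
print(n, c_out)                                # 11 320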
@@ -146,7 +146,8 @@ def load_params(exe, prog, path, ignore_params=[]):
if len(ignore_set) > 0:
for k in ignore_set:
if k in state:
logger.warning('variable {} not used'.format(k))
logger.warning('variable {}: state shape {}, param shape {}'.format(k, state[k].shape, all_var_shape[k]))
# logger.warning('variable {} not used'.format(k))
del state[k]
fluid.io.set_program_state(prog, state)
......
@@ -137,6 +137,7 @@ def main():
# load model
exe.run(startup_prog)
checkpoint.save(exe, startup_prog, 'weights/initial')
if 'weights' in cfg:
checkpoint.load_params(exe, startup_prog, cfg.weights)
......