diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index b793bb23fc183f1fdb4b1fe1b2a9b72c1b6bf80f..84b4677777a79b30ba8936025a60e8d6d9186a2c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -324,7 +324,7 @@ paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index d3a61dc367c3642b8faa9085a470a302712395e5..cace42bc1bae93287c330e54d12126efbf9a14bb 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
+detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
 
 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
similarity index 69%
rename from paddle/fluid/operators/yolov3_loss_op.cc
rename to paddle/fluid/operators/detection/yolov3_loss_op.cc
index 60508f7ab871910c38f1e4aa04c2035075d37df5..2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba 100644
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -9,7 +9,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/yolov3_loss_op.h"
+#include "paddle/fluid/operators/detection/yolov3_loss_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -29,23 +29,33 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                    "Input(GTLabel) of Yolov3LossOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Loss"),
                    "Output(Loss) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ObjectnessMask"),
+        "Output(ObjectnessMask) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"),
+                   "Output(GTMatchMask) of Yolov3LossOp should not be null.");
 
     auto dim_x = ctx->GetInputDim("X");
     auto dim_gtbox = ctx->GetInputDim("GTBox");
     auto dim_gtlabel = ctx->GetInputDim("GTLabel");
     auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    int anchor_num = anchors.size() / 2;
+    auto anchor_mask = ctx->Attrs().Get<std::vector<int>>("anchor_mask");
+    int mask_num = anchor_mask.size();
     auto class_num = ctx->Attrs().Get<int>("class_num");
+
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
     PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
                       "Input(X) dim[3] and dim[4] should be euqal.");
-    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
-                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
-                      "+ class_num)).");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], mask_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
     PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
                       "Input(GTBox) should be a 3-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
     PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
-                      "Input(GTBox) should be a 2-D tensor");
+                      "Input(GTLabel) should be a 2-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
                       "Input(GTBox) and Input(GTLabel) dim[0] should be same");
     PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
@@ -54,11 +64,22 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                       "Attr(anchors) length should be greater then 0.");
     PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
                       "Attr(anchors) length should be even integer.");
+    for (size_t i = 0; i < anchor_mask.size(); i++) {
+      PADDLE_ENFORCE_LT(
+          anchor_mask[i], anchor_num,
+          "Attr(anchor_mask) should not crossover Attr(anchors).");
+    }
     PADDLE_ENFORCE_GT(class_num, 0,
                       "Attr(class_num) should be an integer greater then 0.");
 
-    std::vector<int64_t> dim_out({1});
+    std::vector<int64_t> dim_out({dim_x[0]});
     ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
+
+    std::vector<int64_t> dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]});
+    ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask));
+
+    std::vector<int64_t> dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]});
+    ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask));
   }
 
  protected:
@@ -73,11 +94,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of YOLO v3 loss operator, "
+             "The input tensor of YOLOv3 loss operator, "
              "This is a 4-D tensor with shape of [N, C, H, W]."
              "H and W should be same, and the second dimention(C) stores"
              "box locations, confidence score and classification one-hot"
-             "key of each anchor box");
+             "keys of each anchor box");
     AddInput("GTBox",
              "The input tensor of ground truth boxes, "
              "This is a 3-D tensor with shape of [N, max_box_num, 5], "
@@ -89,32 +110,39 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("GTLabel",
              "The input tensor of ground truth label, "
              "This is a 2-D tensor with shape of [N, max_box_num], "
-             "and each element shoudl be an integer to indicate the "
+             "and each element should be an integer to indicate the "
              "box class id.");
     AddOutput("Loss",
               "The output yolov3 loss tensor, "
-              "This is a 1-D tensor with shape of [1]");
+              "This is a 1-D tensor with shape of [N]");
+    AddOutput("ObjectnessMask",
+              "This is an intermediate tensor with shape of [N, M, H, W], "
+              "M is the number of anchor masks. This parameter caches the "
+              "mask for calculating objectness loss in gradient kernel.")
+        .AsIntermediate();
+    AddOutput("GTMatchMask",
+              "This is an intermediate tensor with shape of [N, B], "
+              "B is the max box number of GT boxes. This parameter caches "
+              "matched mask index of each GT box for gradient calculation.")
+        .AsIntermediate();
 
     AddAttr<int>("class_num", "The number of classes to predict.");
     AddAttr<std::vector<int>>("anchors",
                               "The anchor width and height, "
-                              "it will be parsed pair by pair.");
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<std::vector<int>>("anchor_mask",
+                              "The mask index of anchors used in "
+                              "current YOLOv3 loss calculation.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YOLOv3 loss "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and third YOLOv3 loss operators.")
+        .SetDefault(32);
     AddAttr<float>("ignore_thresh",
-                   "The ignore threshold to ignore confidence loss.");
-    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>(
-        "loss_weight_conf_target",
-        "The weight of confidence score loss in locations with target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_conf_notarget",
-                   "The weight of confidence score loss in locations without "
-                   "target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
-        .SetDefault(1.0);
+                   "The ignore threshold to ignore confidence loss.")
+        .SetDefault(0.7);
     AddComment(R"DOC(
          This operator generate yolov3 loss by given predict result and ground
          truth boxes.
@@ -147,17 +175,28 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          thresh, the confidence score loss of this anchor box will be ignored.
 
          Therefore, the yolov3 loss consist of three major parts, box location loss,
-         confidence score loss, and classification loss. The MSE loss is used for 
-         box location, and binary cross entropy loss is used for confidence score 
-         loss and classification loss.
+         confidence score loss, and classification loss. The L2 loss is used for 
+         box coordinates (w, h), and sigmoid cross entropy loss is used for box 
+         coordinates (x, y), confidence score loss and classification loss.
+
+         Each ground truth box finds a best matching anchor box in all anchors, 
+         prediction of this anchor box will incur all three parts of losses, and
+         prediction of anchor boxes with no GT box matched will only incur objectness
+         loss.
+
+         In order to trade off box coordinate losses between big boxes and small 
+         boxes, box coordinate losses will be multiplied by scale weight, which is
+         calculated as follows.
+
+         $$
+         weight_{box} = 2.0 - t_w * t_h
+         $$
 
          Final loss will be represented as follow.
 
          $$
-         loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
-              + \loss_weight_{conf_target} * loss_{conf_target}
-              + \loss_weight_{conf_notarget} * loss_{conf_notarget}
-              + \loss_weight_{class} * loss_{class}
+         loss = (loss_{xy} + loss_{wh}) * weight_{box}
+              + loss_{conf} + loss_{class}
          $$
          )DOC");
   }
@@ -196,6 +235,8 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("GTBox", Input("GTBox"));
     op->SetInput("GTLabel", Input("GTLabel"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
+    op->SetInput("GTMatchMask", Output("GTMatchMask"));
 
     op->SetAttrMap(Attrs());
 
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8407d4e6e8f87a2e8d073c4fbda5691abe1bba68
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.h
@@ -0,0 +1,447 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+static inline bool LessEqualZero(T x) {
+  return x < 1e-6;
+}
+
+template <typename T>
+static T SigmoidCrossEntropy(T x, T label) {
+  return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x)));
+}
+
+template <typename T>
+static T L2Loss(T x, T y) {
+  return 0.5 * (y - x) * (y - x);
+}
+
+template <typename T>
+static T SigmoidCrossEntropyGrad(T x, T label) {
+  return 1.0 / (1.0 + std::exp(-x)) - label;
+}
+
+template <typename T>
+static T L2LossGrad(T x, T y) {
+  return x - y;
+}
+
+static int GetMaskIndex(std::vector<int> mask, int val) {
+  for (size_t i = 0; i < mask.size(); i++) {
+    if (mask[i] == val) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+template <typename T>
+struct Box {
+  T x, y, w, h;
+};
+
+template <typename T>
+static inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+static inline Box<T> GetYoloBox(const T* x, std::vector<int> anchors, int i,
+                                int j, int an_idx, int grid_size,
+                                int input_size, int index, int stride) {
+  Box<T> b;
+  b.x = (i + sigmoid<T>(x[index])) / grid_size;
+  b.y = (j + sigmoid<T>(x[index + stride])) / grid_size;
+  b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size;
+  b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size;
+  return b;
+}
+
+template <typename T>
+static inline Box<T> GetGtBox(const T* gt, int batch, int max_boxes, int idx) {
+  Box<T> b;
+  b.x = gt[(batch * max_boxes + idx) * 4];
+  b.y = gt[(batch * max_boxes + idx) * 4 + 1];
+  b.w = gt[(batch * max_boxes + idx) * 4 + 2];
+  b.h = gt[(batch * max_boxes + idx) * 4 + 3];
+  return b;
+}
+
+template <typename T>
+static inline T BoxOverlap(T c1, T w1, T c2, T w2) {
+  T l1 = c1 - w1 / 2.0;
+  T l2 = c2 - w2 / 2.0;
+  T left = l1 > l2 ? l1 : l2;
+  T r1 = c1 + w1 / 2.0;
+  T r2 = c2 + w2 / 2.0;
+  T right = r1 < r2 ? r1 : r2;
+  return right - left;
+}
+
+template <typename T>
+static inline T CalcBoxIoU(Box<T> b1, Box<T> b2) {
+  T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w);
+  T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h);
+  T inter_area = (w < 0 || h < 0) ? 0.0 : w * h;
+  T union_area = b1.w * b1.h + b2.w * b2.h - inter_area;
+  return inter_area / union_area;
+}
+
+static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num,
+                                int an_stride, int stride, int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
+                                std::vector<int> anchors, int an_idx,
+                                int box_idx, int gi, int gj, int grid_size,
+                                int input_size, int stride) {
+  T tx = gt.x * grid_size - gi;
+  T ty = gt.y * grid_size - gj;
+  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
+  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
+
+  T scale = (2.0 - gt.w * gt.h);
+  loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
+  loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
+  loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
+  loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
+}
+
+template <typename T>
+static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
+                                    Box<T> gt, std::vector<int> anchors,
+                                    int an_idx, int box_idx, int gi, int gj,
+                                    int grid_size, int input_size, int stride) {
+  T tx = gt.x * grid_size - gi;
+  T ty = gt.y * grid_size - gj;
+  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
+  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
+
+  T scale = (2.0 - gt.w * gt.h);
+  input_grad[box_idx] =
+      SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
+  input_grad[box_idx + stride] =
+      SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
+  input_grad[box_idx + 2 * stride] =
+      L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
+  input_grad[box_idx + 3 * stride] =
+      L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
+}
+
+template <typename T>
+static inline void CalcLabelLoss(T* loss, const T* input, const int index,
+                                 const int label, const int class_num,
+                                 const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    T pred = input[index + i * stride];
+    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
+  }
+}
+
+template <typename T>
+static inline void CalcLabelLossGrad(T* input_grad, const T loss,
+                                     const T* input, const int index,
+                                     const int label, const int class_num,
+                                     const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    T pred = input[index + i * stride];
+    input_grad[index + i * stride] =
+        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
+  }
+}
+
+template <typename T>
+static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
+                                   const int n, const int an_num, const int h,
+                                   const int w, const int stride,
+                                   const int an_stride) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          T obj = objness[k * w + l];
+          if (obj > 1e-5) {
+            // positive sample: obj = 1
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
+          } else if (obj > -0.5) {
+            // negative sample: obj = 0
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
+          }
+        }
+      }
+      objness += stride;
+      input += an_stride;
+    }
+  }
+}
+
+template <typename T>
+static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
+                                       const T* input, const T* objness,
+                                       const int n, const int an_num,
+                                       const int h, const int w,
+                                       const int stride, const int an_stride) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          T obj = objness[k * w + l];
+          if (obj > 1e-5) {
+            input_grad[k * w + l] =
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
+          } else if (obj > -0.5) {
+            input_grad[k * w + l] =
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
+          }
+        }
+      }
+      objness += stride;
+      input += an_stride;
+      input_grad += an_stride;
+    }
+  }
+}
+
+template <typename T>
+static void inline GtValid(bool* valid, const T* gtbox, const int n,
+                           const int b) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < b; j++) {
+      if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) {
+        valid[j] = false;
+      } else {
+        valid[j] = true;
+      }
+    }
+    valid += b;
+    gtbox += b * 4;
+  }
+}
+
+template <typename T>
+class Yolov3LossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* loss = ctx.Output<Tensor>("Loss");
+    auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
+    auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
+    int class_num = ctx.Attr<int>("class_num");
+    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int an_num = anchors.size() / 2;
+    const int mask_num = anchor_mask.size();
+    const int b = gt_box->dims()[1];
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    const T* input_data = input->data<T>();
+    const T* gt_box_data = gt_box->data<T>();
+    const int* gt_label_data = gt_label->data<int>();
+    T* loss_data = loss->mutable_data<T>({n}, ctx.GetPlace());
+    memset(loss_data, 0, loss->numel() * sizeof(T));
+    T* obj_mask_data =
+        objness_mask->mutable_data<T>({n, mask_num, h, w}, ctx.GetPlace());
+    memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T));
+    int* gt_match_mask_data =
+        gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
+
+    // compute the valid gt box mask once so it is not recomputed below
+    Tensor gt_valid_mask;
+    bool* gt_valid_mask_data =
+        gt_valid_mask.mutable_data<bool>({n, b}, ctx.GetPlace());
+    GtValid<T>(gt_valid_mask_data, gt_box_data, n, b);
+
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < mask_num; j++) {
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            // each predicted box finds its best matching gt box; if the IoU is
+            // bigger than ignore_thresh, ignore the objectness loss.
+            int box_idx =
+                GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0);
+            Box<T> pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j],
+                                     h, input_size, box_idx, stride);
+            T best_iou = 0;
+            for (int t = 0; t < b; t++) {
+              if (!gt_valid_mask_data[i * b + t]) {
+                continue;
+              }
+              Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+              T iou = CalcBoxIoU(pred, gt);
+              if (iou > best_iou) {
+                best_iou = iou;
+              }
+            }
+
+            // If best IoU is bigger than ignore_thresh,
+            // ignore the objectness loss.
+            if (best_iou > ignore_thresh) {
+              int obj_idx = (i * mask_num + j) * stride + k * w + l;
+              obj_mask_data[obj_idx] = static_cast<T>(-1);
+            }
+            // all losses should be calculated if best IoU
+            // is bigger than truth thresh, but currently,
+            // truth thresh is an unreachable value as 1.0.
+          }
+        }
+      }
+      for (int t = 0; t < b; t++) {
+        if (!gt_valid_mask_data[i * b + t]) {
+          gt_match_mask_data[i * b + t] = -1;
+          continue;
+        }
+        Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+        int gi = static_cast<int>(gt.x * w);
+        int gj = static_cast<int>(gt.y * h);
+        Box<T> gt_shift = gt;
+        gt_shift.x = 0.0;
+        gt_shift.y = 0.0;
+        T best_iou = 0.0;
+        int best_n = 0;
+        // each gt box finds its best matching anchor box as positive sample,
+        // for positive sample, all losses should be calculated, and for
+        // other samples, only objectness loss is required.
+        for (int an_idx = 0; an_idx < an_num; an_idx++) {
+          Box<T> an_box;
+          an_box.x = 0.0;
+          an_box.y = 0.0;
+          an_box.w = anchors[2 * an_idx] / static_cast<T>(input_size);
+          an_box.h = anchors[2 * an_idx + 1] / static_cast<T>(input_size);
+          float iou = CalcBoxIoU<T>(an_box, gt_shift);
+          if (iou > best_iou) {
+            best_iou = iou;
+            best_n = an_idx;
+          }
+        }
+
+        int mask_idx = GetMaskIndex(anchor_mask, best_n);
+        gt_match_mask_data[i * b + t] = mask_idx;
+        if (mask_idx >= 0) {
+          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                      an_stride, stride, 0);
+          CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
+                                 box_idx, gi, gj, h, input_size, stride);
+
+          int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
+          obj_mask_data[obj_idx] = 1.0;
+
+          int label = gt_label_data[i * b + t];
+          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                        an_stride, stride, 5);
+          CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
+                           class_num, stride);
+        }
+      }
+    }
+
+    CalcObjnessLoss<T>(loss_data, input_data + 4 * stride, obj_mask_data, n,
+                       mask_num, h, w, stride, an_stride);
+  }
+};
+
+template <typename T>
+class Yolov3LossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
+    auto* gt_match_mask = ctx.Input<Tensor>("GTMatchMask");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
+    int class_num = ctx.Attr<int>("class_num");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input_grad->dims()[0];
+    const int c = input_grad->dims()[1];
+    const int h = input_grad->dims()[2];
+    const int w = input_grad->dims()[3];
+    const int mask_num = anchor_mask.size();
+    const int b = gt_match_mask->dims()[1];
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    const T* input_data = input->data<T>();
+    const T* gt_box_data = gt_box->data<T>();
+    const int* gt_label_data = gt_label->data<int>();
+    const T* loss_grad_data = loss_grad->data<T>();
+    const T* obj_mask_data = objness_mask->data<T>();
+    const int* gt_match_mask_data = gt_match_mask->data<int>();
+    T* input_grad_data =
+        input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
+
+    for (int i = 0; i < n; i++) {
+      for (int t = 0; t < b; t++) {
+        int mask_idx = gt_match_mask_data[i * b + t];
+        if (mask_idx >= 0) {
+          Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+          int gi = static_cast<int>(gt.x * w);
+          int gj = static_cast<int>(gt.y * h);
+
+          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                      an_stride, stride, 0);
+          CalcBoxLocationLossGrad<T>(
+              input_grad_data, loss_grad_data[i], input_data, gt, anchors,
+              anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride);
+
+          int label = gt_label_data[i * b + t];
+          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                        an_stride, stride, 5);
+          CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
+                               label_idx, label, class_num, stride);
+        }
+      }
+    }
+
+    CalcObjnessLossGrad<T>(input_grad_data + 4 * stride, loss_grad_data,
+                           input_data + 4 * stride, obj_mask_data, n, mask_num,
+                           h, w, stride, an_stride);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h
deleted file mode 100644
index 0bb285722ddedf721d98237760ec9868e2134442..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/yolov3_loss_op.h
+++ /dev/null
@@ -1,483 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-using Array5 = Eigen::DSizes<int64_t, 5>;
-
-template <typename T>
-static inline bool isZero(T x) {
-  return fabs(x) < 1e-6;
-}
-
-template <typename T>
-static inline T sigmoid(T x) {
-  return 1.0 / (exp(-1.0 * x) + 1.0);
-}
-
-template <typename T>
-static inline T CalcMaskPointNum(const Tensor& mask) {
-  auto mask_t = EigenVector<int>::Flatten(mask);
-  T count = 0.0;
-  for (int i = 0; i < mask_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      count += 1.0;
-    }
-  }
-  return count;
-}
-
-template <typename T>
-static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y,
-                                const Tensor& mask) {
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  T error_sum = 0.0;
-  T points = 0.0;
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      error_sum += pow(x_t(i) - y_t(i), 2);
-      points += 1;
-    }
-  }
-  return (error_sum / points);
-}
-
-template <typename T>
-static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y,
-                                const Tensor& mask, T mf) {
-  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf;
-    }
-  }
-}
-
-template <typename T>
-static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y,
-                                const Tensor& mask) {
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  T error_sum = 0.0;
-  T points = 0.0;
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      error_sum +=
-          -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i)));
-      points += 1;
-    }
-  }
-  return (error_sum / points);
-}
-
-template <typename T>
-static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x,
-                                       const Tensor& y, const Tensor& mask,
-                                       T mf) {
-  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf;
-    }
-  }
-}
-
-template <typename T>
-static void CalcPredResult(const Tensor& input, Tensor* pred_conf,
-                           Tensor* pred_class, Tensor* pred_x, Tensor* pred_y,
-                           Tensor* pred_w, Tensor* pred_h, const int anchor_num,
-                           const int class_num) {
-  const int n = input.dims()[0];
-  const int h = input.dims()[2];
-  const int w = input.dims()[3];
-  const int box_attr_num = 5 + class_num;
-
-  auto input_t = EigenTensor<T, 4>::From(input);
-  auto pred_conf_t = EigenTensor<T, 4>::From(*pred_conf);
-  auto pred_class_t = EigenTensor<T, 5>::From(*pred_class);
-  auto pred_x_t = EigenTensor<T, 4>::From(*pred_x);
-  auto pred_y_t = EigenTensor<T, 4>::From(*pred_y);
-  auto pred_w_t = EigenTensor<T, 4>::From(*pred_w);
-  auto pred_h_t = EigenTensor<T, 4>::From(*pred_h);
-
-  for (int i = 0; i < n; i++) {
-    for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
-      for (int j = 0; j < h; j++) {
-        for (int k = 0; k < w; k++) {
-          pred_x_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx, j, k));
-          pred_y_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k));
-          pred_w_t(i, an_idx, j, k) =
-              input_t(i, box_attr_num * an_idx + 2, j, k);
-          pred_h_t(i, an_idx, j, k) =
-              input_t(i, box_attr_num * an_idx + 3, j, k);
-
-          pred_conf_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k));
-
-          for (int c = 0; c < class_num; c++) {
-            pred_class_t(i, an_idx, j, k, c) =
-                sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k));
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static T CalcBoxIoU(std::vector<T> box1, std::vector<T> box2) {
-  T b1_x1 = box1[0] - box1[2] / 2;
-  T b1_x2 = box1[0] + box1[2] / 2;
-  T b1_y1 = box1[1] - box1[3] / 2;
-  T b1_y2 = box1[1] + box1[3] / 2;
-  T b2_x1 = box2[0] - box2[2] / 2;
-  T b2_x2 = box2[0] + box2[2] / 2;
-  T b2_y1 = box2[1] - box2[3] / 2;
-  T b2_y2 = box2[1] + box2[3] / 2;
-
-  T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1);
-  T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1);
-
-  T inter_rect_x1 = std::max(b1_x1, b2_x1);
-  T inter_rect_y1 = std::max(b1_y1, b2_y1);
-  T inter_rect_x2 = std::min(b1_x2, b2_x2);
-  T inter_rect_y2 = std::min(b1_y2, b2_y2);
-  T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast<T>(0.0)) *
-                 std::max(inter_rect_y2 - inter_rect_y1, static_cast<T>(0.0));
-
-  return inter_area / (b1_area + b2_area - inter_area);
-}
-
-template <typename T>
-static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label,
-                            const float ignore_thresh, std::vector<int> anchors,
-                            const int grid_size, Tensor* obj_mask,
-                            Tensor* noobj_mask, Tensor* tx, Tensor* ty,
-                            Tensor* tw, Tensor* th, Tensor* tconf,
-                            Tensor* tclass) {
-  const int n = gt_box.dims()[0];
-  const int b = gt_box.dims()[1];
-  const int anchor_num = anchors.size() / 2;
-  auto gt_box_t = EigenTensor<T, 3>::From(gt_box);
-  auto gt_label_t = EigenTensor<int, 2>::From(gt_label);
-  auto obj_mask_t = EigenTensor<int, 4>::From(*obj_mask).setConstant(0);
-  auto noobj_mask_t = EigenTensor<int, 4>::From(*noobj_mask).setConstant(1);
-  auto tx_t = EigenTensor<T, 4>::From(*tx).setConstant(0.0);
-  auto ty_t = EigenTensor<T, 4>::From(*ty).setConstant(0.0);
-  auto tw_t = EigenTensor<T, 4>::From(*tw).setConstant(0.0);
-  auto th_t = EigenTensor<T, 4>::From(*th).setConstant(0.0);
-  auto tconf_t = EigenTensor<T, 4>::From(*tconf).setConstant(0.0);
-  auto tclass_t = EigenTensor<T, 5>::From(*tclass).setConstant(0.0);
-
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < b; j++) {
-      if (isZero<T>(gt_box_t(i, j, 0)) && isZero<T>(gt_box_t(i, j, 1)) &&
-          isZero<T>(gt_box_t(i, j, 2)) && isZero<T>(gt_box_t(i, j, 3))) {
-        continue;
-      }
-
-      int cur_label = gt_label_t(i, j);
-      T gx = gt_box_t(i, j, 0) * grid_size;
-      T gy = gt_box_t(i, j, 1) * grid_size;
-      T gw = gt_box_t(i, j, 2) * grid_size;
-      T gh = gt_box_t(i, j, 3) * grid_size;
-      int gi = static_cast<int>(gx);
-      int gj = static_cast<int>(gy);
-
-      T max_iou = static_cast<T>(0);
-      T iou;
-      int best_an_index = -1;
-      std::vector<T> gt_box_shape({0, 0, gw, gh});
-      for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
-        std::vector<T> anchor_shape({0, 0, static_cast<T>(anchors[2 * an_idx]),
-                                     static_cast<T>(anchors[2 * an_idx + 1])});
-        iou = CalcBoxIoU<T>(gt_box_shape, anchor_shape);
-        if (iou > max_iou) {
-          max_iou = iou;
-          best_an_index = an_idx;
-        }
-        if (iou > ignore_thresh) {
-          noobj_mask_t(i, an_idx, gj, gi) = 0;
-        }
-      }
-      obj_mask_t(i, best_an_index, gj, gi) = 1;
-      noobj_mask_t(i, best_an_index, gj, gi) = 0;
-      tx_t(i, best_an_index, gj, gi) = gx - gi;
-      ty_t(i, best_an_index, gj, gi) = gy - gj;
-      tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]);
-      th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]);
-      tclass_t(i, best_an_index, gj, gi, cur_label) = 1;
-      tconf_t(i, best_an_index, gj, gi) = 1;
-    }
-  }
-}
-
-static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand,
-                                    const Tensor& obj_mask) {
-  const int n = obj_mask_expand->dims()[0];
-  const int an_num = obj_mask_expand->dims()[1];
-  const int h = obj_mask_expand->dims()[2];
-  const int w = obj_mask_expand->dims()[3];
-  const int class_num = obj_mask_expand->dims()[4];
-  auto obj_mask_expand_t = EigenTensor<int, 5>::From(*obj_mask_expand);
-  auto obj_mask_t = EigenTensor<int, 4>::From(obj_mask);
-
-  obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1))
-                          .broadcast(Array5(1, 1, 1, 1, class_num));
-}
-
-template <typename T>
-static void AddAllGradToInputGrad(
-    Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y,
-    const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x,
-    const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h,
-    const Tensor& grad_conf_target, const Tensor& grad_conf_notarget,
-    const Tensor& grad_class, const int class_num, const float loss_weight_xy,
-    const float loss_weight_wh, const float loss_weight_conf_target,
-    const float loss_weight_conf_notarget, const float loss_weight_class) {
-  const int n = pred_x.dims()[0];
-  const int an_num = pred_x.dims()[1];
-  const int h = pred_x.dims()[2];
-  const int w = pred_x.dims()[3];
-  const int attr_num = class_num + 5;
-  auto grad_t = EigenTensor<T, 4>::From(*grad).setConstant(0.0);
-  auto pred_x_t = EigenTensor<T, 4>::From(pred_x);
-  auto pred_y_t = EigenTensor<T, 4>::From(pred_y);
-  auto pred_conf_t = EigenTensor<T, 4>::From(pred_conf);
-  auto pred_class_t = EigenTensor<T, 5>::From(pred_class);
-  auto grad_x_t = EigenTensor<T, 4>::From(grad_x);
-  auto grad_y_t = EigenTensor<T, 4>::From(grad_y);
-  auto grad_w_t = EigenTensor<T, 4>::From(grad_w);
-  auto grad_h_t = EigenTensor<T, 4>::From(grad_h);
-  auto grad_conf_target_t = EigenTensor<T, 4>::From(grad_conf_target);
-  auto grad_conf_notarget_t = EigenTensor<T, 4>::From(grad_conf_notarget);
-  auto grad_class_t = EigenTensor<T, 5>::From(grad_class);
-
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < an_num; j++) {
-      for (int k = 0; k < h; k++) {
-        for (int l = 0; l < w; l++) {
-          grad_t(i, j * attr_num, k, l) =
-              grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) *
-              (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy;
-          grad_t(i, j * attr_num + 1, k, l) =
-              grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) *
-              (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy;
-          grad_t(i, j * attr_num + 2, k, l) =
-              grad_w_t(i, j, k, l) * loss * loss_weight_wh;
-          grad_t(i, j * attr_num + 3, k, l) =
-              grad_h_t(i, j, k, l) * loss * loss_weight_wh;
-          grad_t(i, j * attr_num + 4, k, l) =
-              grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
-              (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target;
-          grad_t(i, j * attr_num + 4, k, l) +=
-              grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
-              (1.0 - pred_conf_t(i, j, k, l)) * loss *
-              loss_weight_conf_notarget;
-
-          for (int c = 0; c < class_num; c++) {
-            grad_t(i, j * attr_num + 5 + c, k, l) =
-                grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) *
-                (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-class Yolov3LossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
-    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
-    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
-    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
-    float loss_weight_conf_notarget =
-        ctx.Attr<float>("loss_weight_conf_notarget");
-    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int an_num = anchors.size() / 2;
-
-    Tensor pred_x, pred_y, pred_w, pred_h;
-    Tensor pred_conf, pred_class;
-    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
-                      &pred_w, &pred_h, an_num, class_num);
-
-    Tensor obj_mask, noobj_mask;
-    Tensor tx, ty, tw, th, tconf, tclass;
-    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
-                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
-
-    Tensor obj_mask_expand;
-    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
-                                      ctx.GetPlace());
-    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
-
-    T loss_x = CalcMSEWithMask<T>(pred_x, tx, obj_mask);
-    T loss_y = CalcMSEWithMask<T>(pred_y, ty, obj_mask);
-    T loss_w = CalcMSEWithMask<T>(pred_w, tw, obj_mask);
-    T loss_h = CalcMSEWithMask<T>(pred_h, th, obj_mask);
-    T loss_conf_target = CalcBCEWithMask<T>(pred_conf, tconf, obj_mask);
-    T loss_conf_notarget = CalcBCEWithMask<T>(pred_conf, tconf, noobj_mask);
-    T loss_class = CalcBCEWithMask<T>(pred_class, tclass, obj_mask_expand);
-
-    auto* loss_data = loss->mutable_data<T>({1}, ctx.GetPlace());
-    loss_data[0] = loss_weight_xy * (loss_x + loss_y) +
-                   loss_weight_wh * (loss_w + loss_h) +
-                   loss_weight_conf_target * loss_conf_target +
-                   loss_weight_conf_notarget * loss_conf_notarget +
-                   loss_weight_class * loss_class;
-  }
-};
-
-template <typename T>
-class Yolov3LossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    const T loss = output_grad->data<T>()[0];
-    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
-    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
-    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
-    float loss_weight_conf_notarget =
-        ctx.Attr<float>("loss_weight_conf_notarget");
-    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
-
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int an_num = anchors.size() / 2;
-
-    Tensor pred_x, pred_y, pred_w, pred_h;
-    Tensor pred_conf, pred_class;
-    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
-                      &pred_w, &pred_h, an_num, class_num);
-
-    Tensor obj_mask, noobj_mask;
-    Tensor tx, ty, tw, th, tconf, tclass;
-    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
-                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
-
-    Tensor obj_mask_expand;
-    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
-                                      ctx.GetPlace());
-    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
-
-    Tensor grad_x, grad_y, grad_w, grad_h;
-    Tensor grad_conf_target, grad_conf_notarget, grad_class;
-    grad_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_conf_target.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_conf_notarget.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    T obj_mf = CalcMaskPointNum<int>(obj_mask);
-    T noobj_mf = CalcMaskPointNum<int>(noobj_mask);
-    T obj_expand_mf = CalcMaskPointNum<int>(obj_mask_expand);
-    CalcMSEGradWithMask<T>(&grad_x, pred_x, tx, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_y, pred_y, ty, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_w, pred_w, tw, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_h, pred_h, th, obj_mask, obj_mf);
-    CalcBCEGradWithMask<T>(&grad_conf_target, pred_conf, tconf, obj_mask,
-                           obj_mf);
-    CalcBCEGradWithMask<T>(&grad_conf_notarget, pred_conf, tconf, noobj_mask,
-                           noobj_mf);
-    CalcBCEGradWithMask<T>(&grad_class, pred_class, tclass, obj_mask_expand,
-                           obj_expand_mf);
-
-    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    AddAllGradToInputGrad<T>(
-        input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y,
-        grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class,
-        class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target,
-        loss_weight_conf_notarget, loss_weight_class);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index bf2f587e67099699e0f987ca248cf682ab0c22ce..0602d7a19481fbf0210a7cb4bd15a1033b0e8900 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -508,13 +508,10 @@ def yolov3_loss(x,
                 gtbox,
                 gtlabel,
                 anchors,
+                anchor_mask,
                 class_num,
                 ignore_thresh,
-                loss_weight_xy=None,
-                loss_weight_wh=None,
-                loss_weight_conf_target=None,
-                loss_weight_conf_notarget=None,
-                loss_weight_class=None,
+                downsample_ratio,
                 name=None):
     """
     ${comment}
@@ -526,16 +523,13 @@ def yolov3_loss(x,
                           and x, y, w, h should be relative value of input image.
                           N is the batch number and B is the max box number in 
                           an image.
-        gtlabel (Variable): class id of ground truth boxes, shoud be ins shape
+        gtlabel (Variable): class id of ground truth boxes, should be in shape
                             of [N, B].
         anchors (list|tuple): ${anchors_comment}
+        anchor_mask (list|tuple): ${anchor_mask_comment}
         class_num (int): ${class_num_comment}
         ignore_thresh (float): ${ignore_thresh_comment}
-        loss_weight_xy (float|None): ${loss_weight_xy_comment}
-        loss_weight_wh (float|None): ${loss_weight_wh_comment}
-        loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment}
-        loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment}
-        loss_weight_class (float|None): ${loss_weight_class_comment}
+        downsample_ratio (int): ${downsample_ratio_comment}
         name (string): the name of yolov3 loss
 
     Returns:
@@ -555,9 +549,10 @@ def yolov3_loss(x,
         x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
         gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
         gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
-        anchors = [10, 13, 16, 30, 33, 23]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80
-                                        anchors=anchors, ignore_thresh=0.5)
+        anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
+        anchor_mask = [0, 1, 2]
+        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
+                                        anchor_mask=anchor_mask, class_num=80, ignore_thresh=0.5, downsample_ratio=32)
     """
     helper = LayerHelper('yolov3_loss', **locals())
 
@@ -569,6 +564,8 @@ def yolov3_loss(x,
         raise TypeError("Input gtlabel of yolov3_loss must be Variable")
     if not isinstance(anchors, list) and not isinstance(anchors, tuple):
         raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
+    if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
+        raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple")
     if not isinstance(class_num, int):
         raise TypeError("Attr class_num of yolov3_loss must be an integer")
     if not isinstance(ignore_thresh, float):
@@ -581,31 +578,29 @@ def yolov3_loss(x,
         loss = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
 
+    objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
+    gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
+
     attrs = {
         "anchors": anchors,
+        "anchor_mask": anchor_mask,
         "class_num": class_num,
         "ignore_thresh": ignore_thresh,
+        "downsample_ratio": downsample_ratio,
     }
 
-    if loss_weight_xy is not None and isinstance(loss_weight_xy, float):
-        self.attrs['loss_weight_xy'] = loss_weight_xy
-    if loss_weight_wh is not None and isinstance(loss_weight_wh, float):
-        self.attrs['loss_weight_wh'] = loss_weight_wh
-    if loss_weight_conf_target is not None and isinstance(
-            loss_weight_conf_target, float):
-        self.attrs['loss_weight_conf_target'] = loss_weight_conf_target
-    if loss_weight_conf_notarget is not None and isinstance(
-            loss_weight_conf_notarget, float):
-        self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget
-    if loss_weight_class is not None and isinstance(loss_weight_class, float):
-        self.attrs['loss_weight_class'] = loss_weight_class
-
     helper.append_op(
         type='yolov3_loss',
-        inputs={"X": x,
-                "GTBox": gtbox,
-                "GTLabel": gtlabel},
-        outputs={'Loss': loss},
+        inputs={
+            "X": x,
+            "GTBox": gtbox,
+            "GTLabel": gtlabel,
+        },
+        outputs={
+            'Loss': loss,
+            'ObjectnessMask': objectness_mask,
+            'GTMatchMask': gt_match_mask
+        },
         attrs=attrs)
     return loss
 
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 3e71584a5d1b1a42b2b916d2df33a55c34557046..77dfa1cb519db3faa9ef8b7b27f7a39b5d31f2a8 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -476,8 +476,8 @@ class TestYoloDetection(unittest.TestCase):
             x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
             gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
             gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
-            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10,
-                                      0.5)
+            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13],
+                                      [0, 1], 10, 0.7, 32)
 
             self.assertIsNotNone(loss)
 
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index 544fe4b4f81909b69a05d9751316e3d3137fdc45..020c1139230a9177c4d7765367359d91839d7d46 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -16,174 +16,179 @@ from __future__ import division
 
 import unittest
 import numpy as np
+from scipy.special import logit
+from scipy.special import expit
 from op_test import OpTest
 
 from paddle.fluid import core
 
 
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
+def l2loss(x, y):
+    return 0.5 * (y - x) * (y - x)
 
 
-def mse(x, y, num):
-    return ((y - x)**2).sum() / num
+def sce(x, label):
+    sigmoid_x = expit(x)
+    term1 = label * np.log(sigmoid_x)
+    term2 = (1.0 - label) * np.log(1.0 - sigmoid_x)
+    return -term1 - term2
 
 
-def bce(x, y, mask):
-    x = x.reshape((-1))
-    y = y.reshape((-1))
-    mask = mask.reshape((-1))
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
 
-    error_sum = 0.0
-    count = 0
-    for i in range(x.shape[0]):
-        if mask[i] > 0:
-            error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i])
-            count += 1
-    return error_sum / (-1.0 * count)
 
+def batch_xywh_box_iou(box1, box2):
+    b1_left = box1[:, :, 0] - box1[:, :, 2] / 2
+    b1_right = box1[:, :, 0] + box1[:, :, 2] / 2
+    b1_top = box1[:, :, 1] - box1[:, :, 3] / 2
+    b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2
 
-def box_iou(box1, box2):
-    b1_x1 = box1[0] - box1[2] / 2
-    b1_x2 = box1[0] + box1[2] / 2
-    b1_y1 = box1[1] - box1[3] / 2
-    b1_y2 = box1[1] + box1[3] / 2
-    b2_x1 = box2[0] - box2[2] / 2
-    b2_x2 = box2[0] + box2[2] / 2
-    b2_y1 = box2[1] - box2[3] / 2
-    b2_y2 = box2[1] + box2[3] / 2
+    b2_left = box2[:, :, 0] - box2[:, :, 2] / 2
+    b2_right = box2[:, :, 0] + box2[:, :, 2] / 2
+    b2_top = box2[:, :, 1] - box2[:, :, 3] / 2
+    b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2
 
-    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
-    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+    left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :])
+    right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :])
+    top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :])
+    bottom = np.minimum(b1_bottom[:, :, np.newaxis],
+                        b2_bottom[:, np.newaxis, :])
 
-    inter_rect_x1 = max(b1_x1, b2_x1)
-    inter_rect_y1 = max(b1_y1, b2_y1)
-    inter_rect_x2 = min(b1_x2, b2_x2)
-    inter_rect_y2 = min(b1_y2, b2_y2)
-    inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max(
-        inter_rect_y2 - inter_rect_y1, 0)
+    inter_w = np.clip(right - left, 0., 1.)
+    inter_h = np.clip(bottom - top, 0., 1.)
+    inter_area = inter_w * inter_h
 
-    return inter_area / (b1_area + b2_area + inter_area)
+    b1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
+    b2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
+    union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area
 
+    return inter_area / union
 
-def build_target(gtboxs, gtlabel, attrs, grid_size):
-    n, b, _ = gtboxs.shape
-    ignore_thresh = attrs["ignore_thresh"]
-    anchors = attrs["anchors"]
-    class_num = attrs["class_num"]
-    an_num = len(anchors) // 2
-    obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32')
-    tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tcls = np.zeros(
-        (n, an_num, grid_size, grid_size, class_num)).astype('float32')
 
+def YOLOv3Loss(x, gtbox, gtlabel, attrs):
+    n, c, h, w = x.shape
+    b = gtbox.shape[1]
+    anchors = attrs['anchors']
+    an_num = len(anchors) // 2
+    anchor_mask = attrs['anchor_mask']
+    mask_num = len(anchor_mask)
+    class_num = attrs["class_num"]
+    ignore_thresh = attrs['ignore_thresh']
+    downsample = attrs['downsample']
+    input_size = downsample * h
+    x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+    loss = np.zeros((n)).astype('float32')
+
+    pred_box = x[:, :, :, :, :4].copy()
+    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
+    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
+    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
+    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
+
+    x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
+                                 np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
+                                 class_num)
+
+    mask_anchors = []
+    for m in anchor_mask:
+        mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors])
+    anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1))
+    anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1))
+    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
+    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
+
+    pred_box = pred_box.reshape((n, -1, 4))
+    pred_obj = x[:, :, :, :, 4].reshape((n, -1))
+    objness = np.zeros(pred_box.shape[:2]).astype('float32')
+    ious = batch_xywh_box_iou(pred_box, gtbox)
+    ious_max = np.max(ious, axis=-1)
+    objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness),
+                       objness)
+
+    gtbox_shift = gtbox.copy()
+    gtbox_shift[:, :, 0] = 0
+    gtbox_shift[:, :, 1] = 0
+
+    anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)]
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
+    anchor_boxes = np.concatenate(
+        [np.zeros_like(anchors_s), anchors_s], axis=-1)
+    anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1))
+    ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes)
+    iou_matches = np.argmax(ious, axis=-1)
+    gt_matches = iou_matches.copy()
     for i in range(n):
         for j in range(b):
-            if gtboxs[i, j, :].sum() == 0:
+            if gtbox[i, j, 2:].sum() == 0:
+                gt_matches[i, j] = -1
                 continue
+            if iou_matches[i, j] not in anchor_mask:
+                gt_matches[i, j] = -1
+                continue
+            an_idx = anchor_mask.index(iou_matches[i, j])
+            gt_matches[i, j] = an_idx
+            gi = int(gtbox[i, j, 0] * w)
+            gj = int(gtbox[i, j, 1] * h)
 
-            gt_label = gtlabel[i, j]
-            gx = gtboxs[i, j, 0] * grid_size
-            gy = gtboxs[i, j, 1] * grid_size
-            gw = gtboxs[i, j, 2] * grid_size
-            gh = gtboxs[i, j, 3] * grid_size
-
-            gi = int(gx)
-            gj = int(gy)
-
-            gtbox = [0, 0, gw, gh]
-            max_iou = 0
-            for k in range(an_num):
-                anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]]
-                iou = box_iou(gtbox, anchor_box)
-                if iou > max_iou:
-                    max_iou = iou
-                    best_an_index = k
-                if iou > ignore_thresh:
-                    noobj_mask[i, best_an_index, gj, gi] = 0
-
-            obj_mask[i, best_an_index, gj, gi] = 1
-            noobj_mask[i, best_an_index, gj, gi] = 0
-            tx[i, best_an_index, gj, gi] = gx - gi
-            ty[i, best_an_index, gj, gi] = gy - gj
-            tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 *
-                                                               best_an_index])
-            th[i, best_an_index, gj, gi] = np.log(
-                gh / anchors[2 * best_an_index + 1])
-            tconf[i, best_an_index, gj, gi] = 1
-            tcls[i, best_an_index, gj, gi, gt_label] = 1
-
-    return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask)
-
-
-def YoloV3Loss(x, gtbox, gtlabel, attrs):
-    n, c, h, w = x.shape
-    an_num = len(attrs['anchors']) // 2
-    class_num = attrs["class_num"]
-    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
-    pred_x = sigmoid(x[:, :, :, :, 0])
-    pred_y = sigmoid(x[:, :, :, :, 1])
-    pred_w = x[:, :, :, :, 2]
-    pred_h = x[:, :, :, :, 3]
-    pred_conf = sigmoid(x[:, :, :, :, 4])
-    pred_cls = sigmoid(x[:, :, :, :, 5:])
-
-    tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target(
-        gtbox, gtlabel, attrs, x.shape[2])
-
-    obj_mask_expand = np.tile(
-        np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num'])))
-    loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum())
-    loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum())
-    loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum())
-    loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum())
-    loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask)
-    loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask,
-                             noobj_mask)
-    loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand,
-                     obj_mask_expand)
-
-    return attrs['loss_weight_xy'] * (loss_x + loss_y) \
-            + attrs['loss_weight_wh'] * (loss_w + loss_h) \
-            + attrs['loss_weight_conf_target'] * loss_conf_target \
-            + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \
-            + attrs['loss_weight_class'] * loss_class
+            tx = gtbox[i, j, 0] * w - gi
+            ty = gtbox[i, j, 1] * h - gj  # gj is derived from h, not w
+            tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
+            th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
+            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3])
+            loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
+            loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale
+
+            objness[i, an_idx * h * w + gj * w + gi] = 1.0
+
+            for label_idx in range(class_num):
+                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
+                               float(label_idx == gtlabel[i, j]))
+
+        for j in range(mask_num * h * w):
+            if objness[i, j] > 0:
+                loss[i] += sce(pred_obj[i, j], 1.0)
+            elif objness[i, j] == 0:
+                loss[i] += sce(pred_obj[i, j], 0.0)
+
+    return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \
+            gt_matches.astype('int32'))
 
 
 class TestYolov3LossOp(OpTest):
     def setUp(self):
-        self.loss_weight_xy = 1.0
-        self.loss_weight_wh = 1.0
-        self.loss_weight_conf_target = 1.0
-        self.loss_weight_conf_notarget = 1.0
-        self.loss_weight_class = 1.0
         self.initTestCase()
         self.op_type = 'yolov3_loss'
-        x = np.random.random(size=self.x_shape).astype('float32')
+        x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32'))
         gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
-        gtlabel = np.random.randint(0, self.class_num,
-                                    self.gtbox_shape[:2]).astype('int32')
+        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
+        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
+        gtbox = gtbox * gtmask[:, :, np.newaxis]
+        gtlabel = gtlabel * gtmask
 
         self.attrs = {
             "anchors": self.anchors,
+            "anchor_mask": self.anchor_mask,
             "class_num": self.class_num,
             "ignore_thresh": self.ignore_thresh,
-            "loss_weight_xy": self.loss_weight_xy,
-            "loss_weight_wh": self.loss_weight_wh,
-            "loss_weight_conf_target": self.loss_weight_conf_target,
-            "loss_weight_conf_notarget": self.loss_weight_conf_notarget,
-            "loss_weight_class": self.loss_weight_class,
+            "downsample": self.downsample,
         }
 
-        self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel}
+        self.inputs = {
+            'X': x,
+            'GTBox': gtbox.astype('float32'),
+            'GTLabel': gtlabel.astype('int32'),
+        }
+        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
         self.outputs = {
-            'Loss': np.array(
-                [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32')
+            'Loss': loss,
+            'ObjectnessMask': objness,
+            "GTMatchMask": gt_matches
         }
 
     def test_check_output(self):
@@ -196,19 +201,16 @@ class TestYolov3LossOp(OpTest):
             place, ['X'],
             'Loss',
             no_grad_set=set(["GTBox", "GTLabel"]),
-            max_relative_error=0.06)
+            max_relative_error=0.3)
 
     def initTestCase(self):
-        self.anchors = [10, 13, 12, 12]
-        self.class_num = 10
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        self.anchor_mask = [1, 2]
+        self.class_num = 5
         self.ignore_thresh = 0.5
-        self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7)
-        self.gtbox_shape = (5, 10, 4)
-        self.loss_weight_xy = 2.5
-        self.loss_weight_wh = 0.8
-        self.loss_weight_conf_target = 1.5
-        self.loss_weight_conf_notarget = 0.5
-        self.loss_weight_class = 1.2
+        self.downsample = 32
+        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
+        self.gtbox_shape = (3, 5, 4)
 
 
 if __name__ == "__main__":