diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index c76767dfdd464769ff8962a0512fd6a7705bef6c..3bd0db8b592bdceba3dab670394434ae6872eda1 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -34,11 +34,12 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto dim_gtbox = ctx->GetInputDim("GTBox"); auto dim_gtlabel = ctx->GetInputDim("GTLabel"); auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], "Input(X) dim[3] and dim[4] should be euqal."); - PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + PADDLE_ENFORCE_EQ(dim_x[1], anchor_num * (5 + class_num), "Input(X) dim[1] should be equal to (anchor_number * (5 " "+ class_num))."); PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, @@ -105,20 +106,6 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(406); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); - AddAttr("loss_weight_xy", "The weight of x, y location loss.") - .SetDefault(1.0); - AddAttr("loss_weight_wh", "The weight of w, h location loss.") - .SetDefault(1.0); - AddAttr( - "loss_weight_conf_target", - "The weight of confidence score loss in locations with target object.") - .SetDefault(1.0); - AddAttr("loss_weight_conf_notarget", - "The weight of confidence score loss in locations without " - "target object.") - .SetDefault(1.0); - AddAttr("loss_weight_class", "The weight of classification loss.") - .SetDefault(1.0); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. 
diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index d0064a81902b81e6ab2605bb7dd1af5f164ec73d..5de5b4efc797c6c5e95a8f5da7b440f30ca04909 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -164,48 +164,50 @@ static inline void CalcSCEGradWithWeight(const T* loss_grad, Tensor* grad, } } -template -static void SplitPredResult(const Tensor& input, Tensor* pred_conf, - Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, - Tensor* pred_w, Tensor* pred_h, - const int anchor_num, const int class_num) { - const int n = input.dims()[0]; - const int h = input.dims()[2]; - const int w = input.dims()[3]; - const int box_attr_num = 5 + class_num; - - auto input_t = EigenTensor::From(input); - auto pred_conf_t = EigenTensor::From(*pred_conf); - auto pred_class_t = EigenTensor::From(*pred_class); - auto pred_x_t = EigenTensor::From(*pred_x); - auto pred_y_t = EigenTensor::From(*pred_y); - auto pred_w_t = EigenTensor::From(*pred_w); - auto pred_h_t = EigenTensor::From(*pred_h); - - for (int i = 0; i < n; i++) { - for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - for (int j = 0; j < h; j++) { - for (int k = 0; k < w; k++) { - pred_x_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx, j, k); - pred_y_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 1, j, k); - pred_w_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 2, j, k); - pred_h_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 3, j, k); - - pred_conf_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 4, j, k); - - for (int c = 0; c < class_num; c++) { - pred_class_t(i, an_idx, j, k, c) = - input_t(i, box_attr_num * an_idx + 5 + c, j, k); - } - } - } - } - } -} +// template +// static void SplitPredResult(const Tensor& input, Tensor* pred_conf, +// Tensor* pred_class, Tensor* pred_x, Tensor* +// pred_y, +// Tensor* pred_w, Tensor* pred_h, +// const int anchor_num, const int class_num) { +// 
const int n = input.dims()[0]; +// const int h = input.dims()[2]; +// const int w = input.dims()[3]; +// const int box_attr_num = 5 + class_num; +// +// auto input_t = EigenTensor::From(input); +// auto pred_conf_t = EigenTensor::From(*pred_conf); +// auto pred_class_t = EigenTensor::From(*pred_class); +// auto pred_x_t = EigenTensor::From(*pred_x); +// auto pred_y_t = EigenTensor::From(*pred_y); +// auto pred_w_t = EigenTensor::From(*pred_w); +// auto pred_h_t = EigenTensor::From(*pred_h); +// +// for (int i = 0; i < n; i++) { +// for (int an_idx = 0; an_idx < anchor_num; an_idx++) { +// for (int j = 0; j < h; j++) { +// for (int k = 0; k < w; k++) { +// pred_x_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx, j, +// k); +// pred_y_t(i, an_idx, j, k) = +// input_t(i, box_attr_num * an_idx + 1, j, k); +// pred_w_t(i, an_idx, j, k) = +// input_t(i, box_attr_num * an_idx + 2, j, k); +// pred_h_t(i, an_idx, j, k) = +// input_t(i, box_attr_num * an_idx + 3, j, k); +// +// pred_conf_t(i, an_idx, j, k) = +// input_t(i, box_attr_num * an_idx + 4, j, k); +// +// for (int c = 0; c < class_num; c++) { +// pred_class_t(i, an_idx, j, k, c) = +// input_t(i, box_attr_num * an_idx + 5 + c, j, k); +// } +// } +// } +// } +// } +// } template static T CalcBoxIoU(std::vector box1, std::vector box2) { @@ -235,7 +237,7 @@ template static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, const float ignore_thresh, std::vector anchors, const int input_size, const int grid_size, - Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, + Tensor* conf_mask, Tensor* obj_mask, Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tweight, Tensor* tconf, Tensor* tclass) { const int n = gt_box.dims()[0]; @@ -243,8 +245,8 @@ static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, const int anchor_num = anchors.size() / 2; auto gt_box_t = EigenTensor::From(gt_box); auto gt_label_t = EigenTensor::From(gt_label); - auto obj_mask_t = 
EigenTensor::From(*obj_mask).setConstant(0); - auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); + auto conf_mask_t = EigenTensor::From(*conf_mask).setConstant(1.0); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0.0); auto tx_t = EigenTensor::From(*tx).setConstant(0.0); auto ty_t = EigenTensor::From(*ty).setConstant(0.0); auto tw_t = EigenTensor::From(*tw).setConstant(0.0); @@ -280,11 +282,11 @@ static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, best_an_index = an_idx; } if (iou > ignore_thresh) { - noobj_mask_t(i, an_idx, gj, gi) = static_cast(0.0); + conf_mask_t(i, an_idx, gj, gi) = static_cast(0.0); } } + conf_mask_t(i, best_an_index, gj, gi) = static_cast(1.0); obj_mask_t(i, best_an_index, gj, gi) = static_cast(1.0); - noobj_mask_t(i, best_an_index, gj, gi) = static_cast(0.0); tx_t(i, best_an_index, gj, gi) = gx - gi; ty_t(i, best_an_index, gj, gi) = gy - gj; tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); @@ -298,53 +300,194 @@ static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, } template -static void AddAllGradToInputGrad( - Tensor* grad, const Tensor& grad_x, const Tensor& grad_y, - const Tensor& grad_w, const Tensor& grad_h, const Tensor& grad_conf_target, - const Tensor& grad_conf_notarget, const Tensor& grad_class, - const int class_num, const float loss_weight_xy, const float loss_weight_wh, - const float loss_weight_conf_target, const float loss_weight_conf_notarget, - const float loss_weight_class) { - const int n = grad_x.dims()[0]; - const int an_num = grad_x.dims()[1]; - const int h = grad_x.dims()[2]; - const int w = grad_x.dims()[3]; - const int attr_num = class_num + 5; - auto grad_t = EigenTensor::From(*grad).setConstant(0.0); - auto grad_x_t = EigenTensor::From(grad_x); - auto grad_y_t = EigenTensor::From(grad_y); - auto grad_w_t = EigenTensor::From(grad_w); - auto grad_h_t = EigenTensor::From(grad_h); - auto grad_conf_target_t = 
EigenTensor::From(grad_conf_target); - auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); - auto grad_class_t = EigenTensor::From(grad_class); +static T SCE(T x, T label) { + return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x))); +} + +template +static T L1Loss(T x, T y) { + return std::abs(y - x); +} + +template +static T SCEGrad(T x, T label) { + return 1.0 / (1.0 + std::exp(-x)) - label; +} + +template +static T L1LossGrad(T x, T y) { + return x > y ? 1.0 : -1.0; +} + +template +static void CalcSCE(T* loss_data, const T* input, const T* target, + const T* weight, const T* mask, const int n, + const int an_num, const int grid_num, const int class_num, + const int num) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < grid_num; k++) { + int sub_idx = k * num; + for (int l = 0; l < num; l++) { + loss_data[i] += SCE(input[l * grid_num + k], target[sub_idx + l]) * + weight[k] * mask[k]; + } + } + input += (class_num + 5) * grid_num; + target += grid_num * num; + weight += grid_num; + mask += grid_num; + } + } +} +template +static void CalcSCEGrad(T* input_grad, const T* loss_grad, const T* input, + const T* target, const T* weight, const T* mask, + const int n, const int an_num, const int grid_num, + const int class_num, const int num) { for (int i = 0; i < n; i++) { for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * loss_weight_xy; - grad_t(i, j * attr_num + 1, k, l) = - grad_y_t(i, j, k, l) * loss_weight_xy; - grad_t(i, j * attr_num + 2, k, l) = - grad_w_t(i, j, k, l) * loss_weight_wh; - grad_t(i, j * attr_num + 3, k, l) = - grad_h_t(i, j, k, l) * loss_weight_wh; - grad_t(i, j * attr_num + 4, k, l) = - grad_conf_target_t(i, j, k, l) * loss_weight_conf_target; - grad_t(i, j * attr_num + 4, k, l) += - grad_conf_notarget_t(i, j, k, l) * loss_weight_conf_notarget; - - for (int c = 0; 
c < class_num; c++) { - grad_t(i, j * attr_num + 5 + c, k, l) = - grad_class_t(i, j, k, l, c) * loss_weight_class; - } + for (int k = 0; k < grid_num; k++) { + int sub_idx = k * num; + for (int l = 0; l < num; l++) { + input_grad[l * grid_num + k] = + SCEGrad(input[l * grid_num + k], target[sub_idx + l]) * + weight[k] * mask[k] * loss_grad[i]; } } + input_grad += (class_num + 5) * grid_num; + input += (class_num + 5) * grid_num; + target += grid_num * num; + weight += grid_num; + mask += grid_num; + } + } +} + +template +static void CalcL1Loss(T* loss_data, const T* input, const T* target, + const T* weight, const T* mask, const int n, + const int an_num, const int grid_num, + const int class_num) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < grid_num; k++) { + loss_data[i] += L1Loss(input[k], target[k]) * weight[k] * mask[k]; + } + input += (class_num + 5) * grid_num; + target += grid_num; + weight += grid_num; + mask += grid_num; + } + } +} + +template +static void CalcL1LossGrad(T* input_grad, const T* loss_grad, const T* input, + const T* target, const T* weight, const T* mask, + const int n, const int an_num, const int grid_num, + const int class_num) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < grid_num; k++) { + input_grad[k] = L1LossGrad(input[k], target[k]) * weight[k] * + mask[k] * loss_grad[i]; + } + input_grad += (class_num + 5) * grid_num; + input += (class_num + 5) * grid_num; + target += grid_num; + weight += grid_num; + mask += grid_num; } } } +template +static void CalcYolov3Loss(T* loss_data, const Tensor& input, const Tensor& tx, + const Tensor& ty, const Tensor& tw, const Tensor& th, + const Tensor& tweight, const Tensor& tconf, + const Tensor& tclass, const Tensor& conf_mask, + const Tensor& obj_mask) { + const T* input_data = input.data(); + const T* tx_data = tx.data(); + const T* ty_data = ty.data(); + const T* tw_data = tw.data(); + const T* 
th_data = th.data(); + const T* tweight_data = tweight.data(); + const T* tconf_data = tconf.data(); + const T* tclass_data = tclass.data(); + const T* conf_mask_data = conf_mask.data(); + const T* obj_mask_data = obj_mask.data(); + + const int n = tclass.dims()[0]; + const int an_num = tclass.dims()[1]; + const int h = tclass.dims()[2]; + const int w = tclass.dims()[3]; + const int class_num = tclass.dims()[4]; + const int grid_num = h * w; + + CalcSCE(loss_data, input_data, tx_data, tweight_data, obj_mask_data, n, + an_num, grid_num, class_num, 1); + CalcSCE(loss_data, input_data + grid_num, ty_data, tweight_data, + obj_mask_data, n, an_num, grid_num, class_num, 1); + CalcL1Loss(loss_data, input_data + 2 * grid_num, tw_data, tweight_data, + obj_mask_data, n, an_num, grid_num, class_num); + CalcL1Loss(loss_data, input_data + 3 * grid_num, th_data, tweight_data, + obj_mask_data, n, an_num, grid_num, class_num); + CalcSCE(loss_data, input_data + 4 * grid_num, tconf_data, conf_mask_data, + conf_mask_data, n, an_num, grid_num, class_num, 1); + CalcSCE(loss_data, input_data + 5 * grid_num, tclass_data, obj_mask_data, + obj_mask_data, n, an_num, grid_num, class_num, class_num); +} + +template +static void CalcYolov3LossGrad(T* input_grad_data, const Tensor& loss_grad, + const Tensor& input, const Tensor& tx, + const Tensor& ty, const Tensor& tw, + const Tensor& th, const Tensor& tweight, + const Tensor& tconf, const Tensor& tclass, + const Tensor& conf_mask, + const Tensor& obj_mask) { + const T* loss_grad_data = loss_grad.data(); + const T* input_data = input.data(); + const T* tx_data = tx.data(); + const T* ty_data = ty.data(); + const T* tw_data = tw.data(); + const T* th_data = th.data(); + const T* tweight_data = tweight.data(); + const T* tconf_data = tconf.data(); + const T* tclass_data = tclass.data(); + const T* conf_mask_data = conf_mask.data(); + const T* obj_mask_data = obj_mask.data(); + + const int n = tclass.dims()[0]; + const int an_num = 
tclass.dims()[1]; + const int h = tclass.dims()[2]; + const int w = tclass.dims()[3]; + const int class_num = tclass.dims()[4]; + const int grid_num = h * w; + + CalcSCEGrad(input_grad_data, loss_grad_data, input_data, tx_data, + tweight_data, obj_mask_data, n, an_num, grid_num, class_num, + 1); + CalcSCEGrad(input_grad_data + grid_num, loss_grad_data, + input_data + grid_num, ty_data, tweight_data, obj_mask_data, n, + an_num, grid_num, class_num, 1); + CalcL1LossGrad(input_grad_data + 2 * grid_num, loss_grad_data, + input_data + 2 * grid_num, tw_data, tweight_data, + obj_mask_data, n, an_num, grid_num, class_num); + CalcL1LossGrad(input_grad_data + 3 * grid_num, loss_grad_data, + input_data + 3 * grid_num, th_data, tweight_data, + obj_mask_data, n, an_num, grid_num, class_num); + CalcSCEGrad(input_grad_data + 4 * grid_num, loss_grad_data, + input_data + 4 * grid_num, tconf_data, conf_mask_data, + conf_mask_data, n, an_num, grid_num, class_num, 1); + CalcSCEGrad(input_grad_data + 5 * grid_num, loss_grad_data, + input_data + 5 * grid_num, tclass_data, obj_mask_data, + obj_mask_data, n, an_num, grid_num, class_num, class_num); +} + template class Yolov3LossKernel : public framework::OpKernel { public: @@ -357,33 +500,16 @@ class Yolov3LossKernel : public framework::OpKernel { int class_num = ctx.Attr("class_num"); int input_size = ctx.Attr("input_size"); float ignore_thresh = ctx.Attr("ignore_thresh"); - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, 
ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - SplitPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, an_num, class_num); - - Tensor obj_mask, noobj_mask; + Tensor conf_mask, obj_mask; Tensor tx, ty, tw, th, tweight, tconf, tclass; + conf_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); @@ -392,35 +518,13 @@ class Yolov3LossKernel : public framework::OpKernel { tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, input_size, - h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tweight, + h, &conf_mask, &obj_mask, &tx, &ty, &tw, &th, &tweight, &tconf, &tclass); - Tensor obj_weight; - obj_weight.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - auto obj_weight_t = EigenTensor::From(obj_weight); - auto obj_mask_t = EigenTensor::From(obj_mask); - auto tweight_t = EigenTensor::From(tweight); - obj_weight_t = obj_mask_t * tweight_t; - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - auto obj_mask_expand_t = EigenTensor::From(obj_mask_expand); - obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) - .broadcast(Array5(1, 1, 1, 1, class_num)); - T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); memset(loss_data, 0, n * sizeof(T)); - CalcSCEWithWeight(pred_x, tx, obj_weight, loss_weight_xy, 
loss_data); - CalcSCEWithWeight(pred_y, ty, obj_weight, loss_weight_xy, loss_data); - CalcL1LossWithWeight(pred_w, tw, obj_weight, loss_weight_wh, loss_data); - CalcL1LossWithWeight(pred_h, th, obj_weight, loss_weight_wh, loss_data); - CalcSCEWithWeight(pred_conf, tconf, obj_mask, loss_weight_conf_target, - loss_data); - CalcSCEWithWeight(pred_conf, tconf, noobj_mask, - loss_weight_conf_notarget, loss_data); - CalcSCEWithWeight(pred_class, tclass, obj_mask_expand, loss_weight_class, - loss_data); + CalcYolov3Loss(loss_data, *input, tx, ty, tw, th, tweight, tconf, tclass, + conf_mask, obj_mask); } }; @@ -436,14 +540,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { float ignore_thresh = ctx.Attr("ignore_thresh"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - const T* loss_grad_data = loss_grad->data(); int input_size = ctx.Attr("input_size"); - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -451,21 +548,10 @@ class Yolov3LossGradKernel : public framework::OpKernel { const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - SplitPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, 
an_num, class_num); - - Tensor obj_mask, noobj_mask; + Tensor conf_mask, obj_mask; Tensor tx, ty, tw, th, tweight, tconf, tclass; + conf_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); @@ -474,51 +560,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, input_size, - h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tweight, + h, &conf_mask, &obj_mask, &tx, &ty, &tw, &th, &tweight, &tconf, &tclass); - Tensor obj_weight; - obj_weight.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - auto obj_weight_t = EigenTensor::From(obj_weight); - auto obj_mask_t = EigenTensor::From(obj_mask); - auto tweight_t = EigenTensor::From(tweight); - obj_weight_t = obj_mask_t * tweight_t; - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - auto obj_mask_expand_t = EigenTensor::From(obj_mask_expand); - obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) - .broadcast(Array5(1, 1, 1, 1, class_num)); - - Tensor grad_x, grad_y, grad_w, grad_h; - Tensor grad_conf_target, grad_conf_notarget, grad_class; - grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - 
CalcSCEGradWithWeight(loss_grad_data, &grad_x, pred_x, tx, obj_weight); - CalcSCEGradWithWeight(loss_grad_data, &grad_y, pred_y, ty, obj_weight); - CalcL1LossGradWithWeight(loss_grad_data, &grad_w, pred_w, tw, - obj_weight); - CalcL1LossGradWithWeight(loss_grad_data, &grad_h, pred_h, th, - obj_weight); - CalcSCEGradWithWeight(loss_grad_data, &grad_conf_target, pred_conf, - tconf, obj_mask); - CalcSCEGradWithWeight(loss_grad_data, &grad_conf_notarget, pred_conf, - tconf, noobj_mask); - CalcSCEGradWithWeight(loss_grad_data, &grad_class, pred_class, tclass, - obj_mask_expand); - - input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); - AddAllGradToInputGrad(input_grad, grad_x, grad_y, grad_w, grad_h, - grad_conf_target, grad_conf_notarget, grad_class, - class_num, loss_weight_xy, loss_weight_wh, - loss_weight_conf_target, loss_weight_conf_notarget, - loss_weight_class); + T* input_grad_data = + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + CalcYolov3LossGrad(input_grad_data, *loss_grad, *input, tx, ty, tw, th, + tweight, tconf, tclass, conf_mask, obj_mask); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 5fb4588e0b977d185a7d70d11527a73a50277253..caa9b1c3d4723cfab690e42221c3437e70705ef2 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -416,11 +416,6 @@ def yolov3_loss(x, class_num, ignore_thresh, input_size, - loss_weight_xy=None, - loss_weight_wh=None, - loss_weight_conf_target=None, - loss_weight_conf_notarget=None, - loss_weight_class=None, name=None): """ ${comment} @@ -438,11 +433,6 @@ def yolov3_loss(x, class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} input_size (int): ${input_size_comment} - loss_weight_xy (float|None): ${loss_weight_xy_comment} - loss_weight_wh (float|None): ${loss_weight_wh_comment} - loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} - loss_weight_conf_notarget 
(float|None): ${loss_weight_conf_notarget_comment} - loss_weight_class (float|None): ${loss_weight_class_comment} name (string): the name of yolov3 loss Returns: @@ -495,18 +485,18 @@ def yolov3_loss(x, "input_size": input_size, } - if loss_weight_xy is not None and isinstance(loss_weight_xy, float): - self.attrs['loss_weight_xy'] = loss_weight_xy - if loss_weight_wh is not None and isinstance(loss_weight_wh, float): - self.attrs['loss_weight_wh'] = loss_weight_wh - if loss_weight_conf_target is not None and isinstance( - loss_weight_conf_target, float): - self.attrs['loss_weight_conf_target'] = loss_weight_conf_target - if loss_weight_conf_notarget is not None and isinstance( - loss_weight_conf_notarget, float): - self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget - if loss_weight_class is not None and isinstance(loss_weight_class, float): - self.attrs['loss_weight_class'] = loss_weight_class + # if loss_weight_xy is not None and isinstance(loss_weight_xy, float): + # self.attrs['loss_weight_xy'] = loss_weight_xy + # if loss_weight_wh is not None and isinstance(loss_weight_wh, float): + # self.attrs['loss_weight_wh'] = loss_weight_wh + # if loss_weight_conf_target is not None and isinstance( + # loss_weight_conf_target, float): + # self.attrs['loss_weight_conf_target'] = loss_weight_conf_target + # if loss_weight_conf_notarget is not None and isinstance( + # loss_weight_conf_notarget, float): + # self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget + # if loss_weight_class is not None and isinstance(loss_weight_class, float): + # self.attrs['loss_weight_class'] = loss_weight_class helper.append_op( type='yolov3_loss', diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 9cf398f18f953f96908959da0c5ed72fd7f43722..0fe836683b029698b670bbb9f9bb258c2f3b68a0 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -470,8 
+470,6 @@ class OpTest(unittest.TestCase): ] analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set) - # print(numeric_grads[0][0, 4, :, :]) - # print(analytic_grads[0][0, 4, :, :]) self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check, max_relative_error, diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index e218031286fc221dab294cfca70ca81a86285856..cf7e2c52893ea4c2fb80ca24bb11d553e3dc19da 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -80,8 +80,8 @@ def build_target(gtboxes, gtlabel, attrs, grid_size): class_num = attrs["class_num"] input_size = attrs["input_size"] an_num = len(anchors) // 2 + conf_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') @@ -114,10 +114,10 @@ def build_target(gtboxes, gtlabel, attrs, grid_size): max_iou = iou best_an_index = k if iou > ignore_thresh: - noobj_mask[i, best_an_index, gj, gi] = 0 + conf_mask[i, k, gj, gi] = 0 + conf_mask[i, best_an_index, gj, gi] = 1 obj_mask[i, best_an_index, gj, gi] = 1 - noobj_mask[i, best_an_index, gj, gi] = 0 tx[i, best_an_index, gj, gi] = gx - gi ty[i, best_an_index, gj, gi] = gy - gj tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * @@ -129,7 +129,7 @@ def build_target(gtboxes, gtlabel, attrs, grid_size): tconf[i, best_an_index, gj, gi] = 1 tcls[i, best_an_index, gj, gi, gt_label] = 1 - return (tx, ty, tw, th, tweight, tconf, tcls, obj_mask, noobj_mask) + return (tx, ty, tw, th, tweight, tconf, tcls, 
conf_mask, obj_mask) def YoloV3Loss(x, gtbox, gtlabel, attrs): @@ -144,11 +144,9 @@ def YoloV3Loss(x, gtbox, gtlabel, attrs): pred_conf = x[:, :, :, :, 4] pred_cls = x[:, :, :, :, 5:] - tx, ty, tw, th, tweight, tconf, tcls, obj_mask, noobj_mask = build_target( + tx, ty, tw, th, tweight, tconf, tcls, conf_mask, obj_mask = build_target( gtbox, gtlabel, attrs, x.shape[2]) - # print("obj_mask: ", obj_mask[0, 0, :, :]) - # print("noobj_mask: ", noobj_mask[0, 0, :, :]) obj_weight = obj_mask * tweight obj_mask_expand = np.tile( np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) @@ -156,30 +154,19 @@ def YoloV3Loss(x, gtbox, gtlabel, attrs): loss_y = sce(pred_y, ty, obj_weight) loss_w = l1loss(pred_w, tw, obj_weight) loss_h = l1loss(pred_h, th, obj_weight) - loss_conf_target = sce(pred_conf, tconf, obj_mask) - loss_conf_notarget = sce(pred_conf, tconf, noobj_mask) + loss_obj = sce(pred_conf, tconf, conf_mask) loss_class = sce(pred_cls, tcls, obj_mask_expand) - # print("loss_xy: ", loss_x + loss_y) - # print("loss_wh: ", loss_w + loss_h) - # print("loss_conf_target: ", loss_conf_target) - # print("loss_conf_notarget: ", loss_conf_notarget) - # print("loss_class: ", loss_class) + # print("python loss_xy: ", loss_x + loss_y) + # print("python loss_wh: ", loss_w + loss_h) + # print("python loss_obj: ", loss_obj) + # print("python loss_class: ", loss_class) - return attrs['loss_weight_xy'] * (loss_x + loss_y) \ - + attrs['loss_weight_wh'] * (loss_w + loss_h) \ - + attrs['loss_weight_conf_target'] * loss_conf_target \ - + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ - + attrs['loss_weight_class'] * loss_class + return loss_x + loss_y + loss_w + loss_h + loss_obj + loss_class class TestYolov3LossOp(OpTest): def setUp(self): - self.loss_weight_xy = 1.0 - self.loss_weight_wh = 1.0 - self.loss_weight_conf_target = 1.0 - self.loss_weight_conf_notarget = 1.0 - self.loss_weight_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' x = 
logit(np.random.uniform(0, 1, self.x_shape).astype('float32')) @@ -192,11 +179,6 @@ class TestYolov3LossOp(OpTest): "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, "input_size": self.input_size, - "loss_weight_xy": self.loss_weight_xy, - "loss_weight_wh": self.loss_weight_wh, - "loss_weight_conf_target": self.loss_weight_conf_target, - "loss_weight_conf_notarget": self.loss_weight_conf_notarget, - "loss_weight_class": self.loss_weight_class, } self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} @@ -215,17 +197,12 @@ class TestYolov3LossOp(OpTest): max_relative_error=0.31) def initTestCase(self): - self.anchors = [12, 12] + self.anchors = [12, 12, 11, 13] self.class_num = 5 - self.ignore_thresh = 0.3 + self.ignore_thresh = 0.5 self.input_size = 416 self.x_shape = (3, len(self.anchors) // 2 * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) - self.loss_weight_xy = 1.2 - self.loss_weight_wh = 0.8 - self.loss_weight_conf_target = 2.0 - self.loss_weight_conf_notarget = 1.0 - self.loss_weight_class = 1.5 if __name__ == "__main__":