fix merge issue

11bcb43a · typhoonzero · a8b630c8 · 4f4abfa3 · 11bcb43a · 11bcb43a
43 changed file
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -2,7 +2,7 @@
 |---|---|
 | backyes | Yan-Fei Wang |
 | beckett1124 | Bin Qi |
-| Canpio | Jia-Yi Feng |
+| JiayiFeng | Jia-Yi Feng |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |

--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -82,7 +82,7 @@ language = 'zh_CN'
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_en*', '*_en*']
+exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']
 # The reST default role (used for this markup: `text`) to use for all
 # documents.

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -82,7 +82,7 @@ language = None
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']
 # The reST default role (used for this markup: `text`) to use for all
 # documents.

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -11,7 +11,6 @@ if(MOBILE_INFERENCE)
 else()
  add_subdirectory(pserver)
  add_subdirectory(trainer)
-  add_subdirectory(string)
  add_subdirectory(scripts)
  if(WITH_C_API)

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -4,3 +4,4 @@ add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(inference)
+add_subdirectory(string)
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -314,5 +314,15 @@ DDim stride(const DDim& ddim) {
  }
  return framework::make_ddim(strides);
 }
+// Returns, for each axis i, the number of elements spanned by axes
+// [i, rank): result[i] = ddim[i] * ddim[i + 1] * ... * ddim[rank - 1].
+DDim stride_numel(const framework::DDim& ddim) {
+  const auto rank = ddim.size();
+  std::vector<int64_t> numel(rank);
+  // Seed with the innermost extent, then accumulate outwards.
+  numel[rank - 1] = ddim[rank - 1];
+  for (int axis = rank - 2; axis >= 0; --axis) {
+    numel[axis] = numel[axis + 1] * ddim[axis];
+  }
+  return framework::make_ddim(numel);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -125,6 +125,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims);
 DDim flatten_to_1d(const DDim& src);
 DDim stride(const DDim& ddim);
+DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/string/piece.h"
+#include "paddle/fluid/string/piece.h"
 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -37,9 +37,8 @@ class Vector {
  // Fill vector with value. The vector size is `count`.
  explicit Vector(size_t count, const T& value = T()) {
-    if (count == 0) {
+    InitEmpty();
-      InitEmpty();
+    if (count != 0) {
-    } else {
      resize(count);
      T* ptr = begin();
      for (size_t i = 0; i < count; ++i) {
@@ -122,6 +121,10 @@ class Vector {
  const T* begin() const { return &this->operator[](0); }
  const T* end() const { return &this->operator[](size()); }
+  const T* cbegin() const { return begin(); }
+  const T* cend() const { return end(); }
  const T& back() const {
    auto it = end();
    --it;
@@ -244,7 +247,9 @@ class Vector {
  bool operator==(const Vector<T>& other) const {
    if (size() != other.size()) return false;
-    for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
      if (*it1 != *it2) {
        return false;
      }

--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  vec<int> tmp2;
  tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10);
+  ASSERT_EQ(tmp2.size(), 10UL);
  for (int i = 0; i < 10; ++i) {
    ASSERT_EQ(tmp2[i], i);
    ASSERT_EQ(tmp2[i], tmp[i]);
@@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu(0);
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
@@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) {
  for (int i = 0; i < 10; ++i) {
    tmp.push_back(i);
  }
-  ASSERT_EQ(tmp.size(), 10);
+  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu0(0);
  paddle::platform::SetDeviceId(0);
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
@@ -91,3 +91,10 @@ TEST(mixed_vector, MultiGPU) {
    ASSERT_EQ(tmp[i], i * 100);
  }
 }
+// Constructing a Vector from (count, value) must produce `count` elements,
+// each equal to `value`.
+TEST(mixed_vector, InitWithCount) {
+  // Named `filled` rather than `vec` so the local does not hide the `vec<T>`
+  // template name that the other tests in this file use.
+  paddle::framework::Vector<int> filled(10, 10);
+  ASSERT_EQ(filled.size(), 10UL);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(filled[i], 10);
+  }
+}
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <mutex>   // for call_once
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"
 DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "

--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -28,17 +28,18 @@ class ConcatKernel : public framework::OpKernel<T> {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = ins.size();
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+    auto out_stride = framework::stride_numel(out->dims());
    size_t output_offset = 0;
-    out->mutable_data<T>(ctx.GetPlace());
+    for (auto* in : ins) {
-    auto out_stride = framework::stride(out->dims());
+      auto in_stride = framework::stride_numel(in->dims());
-    for (size_t i = 0; i < n; i++) {
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
-      auto& in = ins[i];
+                                  out->data<T>() + output_offset, out_stride,
-      auto axis_dim = in->dims()[axis];
+                                  in->data<T>(), in_stride);
-      auto in_stride = framework::stride(in->dims());
+      output_offset += in_stride[axis];
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>(), in_stride,
-                       in->dims(), out_stride, out->data<T>() + output_offset);
-      output_offset += axis_dim * in_stride[axis];
    }
  }
 };
@@ -50,17 +51,16 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = outs.size();
    size_t input_offset = 0;
-    auto in_stride = framework::stride(in->dims());
+    auto in_stride = framework::stride_numel(in->dims());
-    for (size_t i = 0; i < n; i++) {
-      auto& out = outs[i];
+    for (auto& out : outs) {
      out->mutable_data<T>(ctx.GetPlace());
-      size_t axis_dim = out->dims()[axis];
+      auto out_stride = framework::stride_numel(out->dims());
-      auto out_stride = framework::stride(out->dims());
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
+                                  out_stride, in->data<T>() + input_offset,
-                       in_stride, out->dims(), out_stride, out->data<T>());
+                                  in_stride);
-      input_offset += axis_dim * in_stride[axis];
+      input_offset += out_stride[axis];
    }
  }
 };

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_server.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"
 namespace paddle {
 namespace operators {
@@ -98,6 +98,7 @@ class ListenAndServOp : public framework::OperatorBase {
      // the gradients arrives, just add suffix 0~n and merge the gradient.
      rpc_service_->SetCond(0);
      size_t recv_var_cnt = 0;
+      size_t update_param_cnt = 0;
      int batch_barrier = 0;
      while (batch_barrier != fan_in) {
        const detail::MessageWithName &v = rpc_service_->Get();
@@ -122,11 +123,10 @@ class ListenAndServOp : public framework::OperatorBase {
        }
      }
      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
-      // TODO(Yancey1989): merge SelectedRows variables here
      if (exit_flag) {
        rpc_service_->ShutDown();
      }
+      VLOG(3) << "run optimize graph...";
      try {
        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
                     false /*create_local_scope*/, false /*create_vars*/);
@@ -134,7 +134,7 @@ class ListenAndServOp : public framework::OperatorBase {
        LOG(ERROR) << "run sub program error " << e.what();
      }
      rpc_service_->SetCond(1);
-      rpc_service_->WaitClientGet(recv_var_cnt);
+      rpc_service_->WaitClientGet(update_param_cnt);
      grads_counter_.clear();
    }  // while(true)
  }

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"
 USE_NO_KERNEL_OP(send);
 USE_NO_KERNEL_OP(listen_and_serv);

--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -29,7 +29,9 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"));
    PADDLE_ENFORCE(ctx->HasInput("Y"));
    framework::DDim out_dim;
-    out_dim = ctx->GetInputDim("Y");
+    auto y_dim = ctx->GetInputDim("Y");
+    out_dim = ctx->GetInputDim("X");
+    out_dim[0] = y_dim[0];
    ctx->ShareLoD("Y", "Out");
    ctx->SetOutputDim("Out", out_dim);
  }

--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <chrono>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
@@ -27,18 +28,18 @@ class SplitOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto in_stride = framework::stride(in->dims());
+    auto in_stride = framework::stride_numel(in->dims());
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    const size_t n = outs.size();
+    auto place = ctx.GetPlace();
    size_t input_offset = 0;
-    for (size_t i = 0; i < n; i++) {
+    for (auto& out : outs) {
-      auto& out = outs[i];
      out->mutable_data<T>(ctx.GetPlace());
-      size_t axis_dim = out->dims()[axis];
+      auto out_stride = framework::stride_numel(out->dims());
-      auto out_stride = framework::stride(out->dims());
+      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
+                                  out_stride, in->data<T>() + input_offset,
-                       in_stride, out->dims(), out_stride, out->data<T>());
+                                  in_stride);
-      input_offset += axis_dim * in_stride[axis];
+      input_offset += out_stride[axis];
    }
  }
 };

--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -41,5 +41,62 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
  boost::apply_visitor(func, dst_dim);
 }
+// Strided numel memory copy from src to dst by the specified axis
+//
+// For example, for a tensor dims [4, 20, 100], the strided numel is
+// [8000, 2000, 100]
+//
+// The tensors are walked as `before` chunks, where `before` is
+// dst_stride_numel[0] / dst_stride_numel[axis]. Chunk i starts at
+// src + i * src_stride_numel[axis] and dst + i * dst_stride_numel[axis], and
+// min(src_stride_numel[axis], dst_stride_numel[axis]) elements are copied per
+// chunk. Copying the smaller of the two keeps a wide-src / narrow-dst call
+// from writing past each dst chunk, while a narrow-src / wide-dst call still
+// copies the whole src chunk.
+//
+// NOTE: The src and dst tensor should have the same elements
+// except the specified axis.
+template <typename T>
+inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
+                                     int64_t axis, T* dst,
+                                     const framework::DDim& dst_stride_numel,
+                                     const T* src,
+                                     const framework::DDim& src_stride_numel) {
+  // Check the ranks first: everything below indexes both vectors with `axis`.
+  PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
+                    "src and dst tensor should have the same dims size.");
+  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
+  int64_t src_after = src_stride_numel[axis];
+  int64_t dst_after = dst_stride_numel[axis];
+  // Never copy more than either side's chunk can hold.
+  int64_t copy_numel = src_after < dst_after ? src_after : dst_after;
+  auto place = ctx.GetPlace();
+  // Walk every axis (not only those in front of `axis`) so that the trailing
+  // axes are validated as well.
+  for (int64_t i = 0; i < src_stride_numel.size(); ++i) {
+    if (i < axis) {
+      // Dividing by the stride at `axis` leaves the product of the extents of
+      // axes [i, axis), which must agree between src and dst.
+      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
+                        dst_stride_numel[i] / dst_stride_numel[axis],
+                        "src and dst should have the same elements "
+                        "except the specified axis.");
+    } else if (i == axis) {
+      continue;
+    } else {
+      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
+                        "src and dst should have the same elements "
+                        "except the specified axis.");
+    }
+  }
+  for (int64_t i = 0; i < before; ++i) {
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
+                   src + i * src_after, sizeof(T) * copy_numel);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
+      auto& cuda_ctx =
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
+      memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
+                   src + i * src_after, sizeof(T) * copy_numel,
+                   cuda_ctx.stream());
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+    }
+  }
+}
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
@@ -22,69 +22,43 @@ class TargetAssignOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
+    // X, MatchIndices, Out and OutWeight are required; NegIndices is only
+    // validated when it is present.
-    // checkout inputs
+    PADDLE_ENFORCE(ctx->HasInput("X"),
-    PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"),
+                   "Input(X) of TargetAssignOp should not be null");
-                   "Input(EncodedGTBBox) of TargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
-                   "Input(GTScoreLabel) of TargetAssignOp should not be null");
    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
                   "Input(MatchIndices) of TargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
-                   "Input(NegIndices) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TargetAssignOp should not be null.");
-    // checkout outputs
+    PADDLE_ENFORCE(ctx->HasOutput("OutWeight"),
-    PADDLE_ENFORCE(
+                   "Output(OutWeight) of TargetAssignOp should not be null.");
-        ctx->HasOutput("PredBBoxLabel"),
-        "Output(PredBBoxLabel) of TargetAssignOp should not be null.");
+    auto in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("PredBBoxWeight"),
-        "Output(PredBBoxWeight) of TargetAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("PredScoreLabel"),
-        "Output(PredScoreLabel) of TargetAssignOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("PredScoreWeight"),
-        "Output(PredScoreWeight) of TargetAssignOp should not be null.");
-    auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
-    auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
    auto mi_dims = ctx->GetInputDim("MatchIndices");
-    auto neg_dims = ctx->GetInputDim("NegIndices");
-    PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL,
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3.");
-                      "The rank of Input(EncodedGTBBox) must be 3.");
+    PADDLE_ENFORCE_EQ(mi_dims.size(), 2,
-    PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
-                      "The rank of Input(GTScoreLabel) must be 2.");
-    PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
                      "The rank of Input(MatchIndices) must be 2.");
-    PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
-                      "The rank of Input(NegIndices) must be 2.");
+    if (ctx->HasInput("NegIndices")) {
+      auto neg_dims = ctx->GetInputDim("NegIndices");
-    PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0],
+      PADDLE_ENFORCE_EQ(neg_dims.size(), 2,
-                      "The 1st dimension (means the total number of "
+                        "The rank of Input(NegIndices) must be 2.");
-                      "ground-truth bounding boxes) of Input(EncodedGTBBox) "
+      PADDLE_ENFORCE_EQ(neg_dims[1], 1,
-                      "and Input(GTScoreLabel) must be the same.");
+                        "The last dimension of Input(NegIndices) must be 1.");
-    PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1],
+    }
-                      "The 2nd dimension (means the number of priod boxes) "
-                      "of Input(EncodedGTBBox) and "
-                      "Input(MatchIndices) must be the same.");
-    PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
-                      "The 3rd dimension of Input(EncodedGTBBox) must be 4.");
+    // Out takes its first two extents from MatchIndices and its last extent
+    // from X; OutWeight carries one value per (row, column) of MatchIndices.
    auto n = mi_dims[0];
-    auto np = mi_dims[1];
+    auto m = mi_dims[1];
-    ctx->SetOutputDim("PredBBoxLabel", {n, np, 4});
+    auto k = in_dims[in_dims.size() - 1];
-    ctx->SetOutputDim("PredBBoxWeight", {n, np, 1});
+    ctx->SetOutputDim("Out", {n, m, k});
-    ctx->SetOutputDim("PredScoreLabel", {n, np, 1});
+    ctx->SetOutputDim("OutWeight", {n, m, 1});
-    ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        framework::ToDataType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),  // kernel data type follows Input(X)
-            ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
        ctx.device_context());
  }
 };
@@ -93,102 +67,87 @@ class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Declares the inputs (X, MatchIndices and the dispensable NegIndices),
+    // the mismatch_value attribute, the outputs (Out, OutWeight) and the
+    // operator documentation.
-    AddInput("EncodedGTBBox",
+    AddInput("X",
-             "(LoDTensor), The encoded ground-truth bounding boxes with shape "
+             "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
-             "[Ng, Np, 4], where Ng is the total number of ground-truth boxes "
+             "Some elements in X will be assigned to Out based on the "
-             "in this mini-batch, Np the number of predictions, 4 is the "
+             "MatchIndices and NegIndices.");
-             "number of coordinate in [xmin, ymin, xmax, ymax] layout.");
-    AddInput("GTScoreLabel",
-             "(LoDTensor, default LoDTensor<int>),  The input ground-truth "
-             "labels with shape [Ng, 1], where the Ng is the same as it in "
-             "the input of EncodedGTBBox.");
    AddInput("MatchIndices",
             "(Tensor, default Tensor<int>), The input matched indices "
-             "with shape [N, Np], where N is the batch size, Np is the same "
+             "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity "
-             "as it in the input of EncodedGTBBox. If MatchIndices[i][j] "
+             "of column is not matched to any entity of row in i-th instance.");
-             "is -1, the j-th prior box is not matched to any ground-truh "
-             "box in i-th instance.");
    AddInput("NegIndices",
             "(LoDTensor, default LoDTensor<int>), The input negative example "
-             "indices with shape [Neg, 1], where is the total number of "
+             "indices are an optional input with shape [Neg, 1], where Neg is "
-             "negative example indices.");
+             "the total number of negative example indices.")
-    AddAttr<int>("background_label",
+        .AsDispensable();
-                 "(int, default 0), Label index of background class.")
+    AddAttr<int>("mismatch_value",
+                 "(int, default 0), Fill this value to the "
+                 "mismatched location.")
        .SetDefault(0);
-    AddOutput("PredBBoxLabel",
+    AddOutput("Out",
-              "(Tensor), The output encoded ground-truth labels "
+              "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
-              "with shape [N, Np, 4], N is the batch size and Np, 4 is the "
+              "N and P are the same as they are in MatchIndices, K is the "
-              "same as they in input of EncodedGTBBox. If MatchIndices[i][j] "
+              "same as it in input of X. If MatchIndices[i][j] "
-              "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth "
+              "is -1, the Out[i][j][0 : K] is the mismatch_value.");
-              "box for background_label in i-th instance.");
+    AddOutput("OutWeight",
-    AddOutput("PredBBoxWeight",
+              "(Tensor), The weight for output with the shape of [N, P, 1]");
-              "(Tensor), The weight for PredBBoxLabel with the shape "
-              "of [N, Np, 1]");
-    AddOutput("PredScoreLabel",
-              "(Tensor, default Tensor<int>), The output score labels for "
-              "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
-              "is -1, PredScoreLabel[i][j] = background_label.");
-    AddOutput("PredScoreWeight",
-              "(Tensor), The weight for PredScoreLabel with the shape "
-              "of [N, Np, 1]");
    AddComment(R"DOC(
-This operator is, for given the encoded boxes between prior boxes and
+This operator can be, for given the target bounding boxes or labels,
-ground-truth boxes and ground-truth class labels, to assign classification
+to assign classification and regression targets to each prediction as well as
-and regression targets to each prior box as well as weights to each
+weights to prediction. The weights is used to specify which prediction would
-prior box. The weights is used to specify which prior box would not contribute
+not contribute to training loss.
-to training loss.
+For each instance, the output `Out` and `OutWeight` are assigned based on
-For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`,
+`MatchIndices` and `NegIndices`.
-`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`.
+Assumed that the row offset for each instance in `X` is called lod,
-Assumed that the row offset for each instance in `EncodedGTBBox` is called lod,
+this operator assigns classification/regression targets by performing the
-this operato assigns classification/regression targets by performing the
 following steps:
-1. Assigning all outpts based on `MatchIndices`:
+1. Assigning all outputs based on `MatchIndices`:
 If id = MatchIndices[i][j] > 0,
-    PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j]
+    Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-    PredBBoxWeight[i][j] = 1.
+    OutWeight[i][j] = 1.
-    PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
-    PredScoreWeight[i][j] = 1.
 Otherwise, 
-    PredBBoxLabel[j][j] = [0., 0., 0., 0.]
+    Out[i][j][0 : K] = {mismatch_value, mismatch_value, ...}
-    PredBBoxWeight[i][j] = 0.
+    OutWeight[i][j] = 0.
-    PredScoreLabel[i][j] = background_label
-    PredScoreWeight[i][j] = 0.
-2. Assigning PredScoreWeight based on `NegIndices`:
+2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided:
-Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod,
+Assumed that the row offset for each instance in `NegIndices` is called neg_lod,
-for i-th instance and all ids of NegIndices in this instance:
+for i-th instance and each `id` of NegIndices in this instance:
-    PredScoreLabel[i][id] = background_label
+    Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
-    PredScoreWeight[i][id] = 1.0
+    OutWeight[i][id] = 1.0
    )DOC");
  }
 };
-template <typename T>
+template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> {
+struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
+  // For instance i, [lod[i], lod[i + 1]) is the slice of neg_indices that
+  // belongs to it. Each listed column id gets its K output values set to
+  // mismatch_value and its weight set to 1.
  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
-                  const size_t* lod, const int num, const int num_prior_box,
+                  const size_t* lod, const int N, const int M, const int K,
-                  const int background_label, int* out_label, T* out_label_wt) {
+                  const int mismatch_value, T* out, WT* out_wt) {
-    for (int i = 0; i < num; ++i) {
+    for (int i = 0; i < N; ++i) {
      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
        int id = neg_indices[j];
-        out_label[i * num_prior_box + id] = background_label;
-        out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0);
+        int row = i * M + id;
+        // The weight buffer has one entry per (instance, column): OutWeight
+        // is shaped [N, M, 1] in InferShape. Index it by `row`, not by the
+        // K-scaled offset used for `out`, which would run past the end of
+        // the buffer whenever K > 1.
+        out_wt[row] = static_cast<WT>(1.0);
+        for (int k = 0; k < K; ++k) {
+          out[row * K + k] = mismatch_value;
+        }
      }
    }
  }
 };
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
+                                       float>;
 }  // namespace operators
 }  // namespace paddle
@@ -198,5 +157,5 @@ REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
                             ops::TargetAssignOpMaker);
 REGISTER_OP_CPU_KERNEL(
    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
--- a/paddle/fluid/operators/target_assign_op.cu
+++ b/paddle/fluid/operators/target_assign_op.cu
@@ -17,39 +17,41 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename T>
+template <typename T, typename WT>
 __global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
-                                      const int num, const int num_prior_box,
+                                      const int N, const int M, const int K,
-                                      const int background_label,
+                                      const int mismatch_value, T* out,
-                                      int* out_label, T* out_label_wt) {
+                                      WT* out_wt) {
+  // Block blockIdx.x handles one instance; its threads step through that
+  // instance's slice [lod[bidx], lod[bidx + 1]) of neg_indices in strides of
+  // blockDim.x.
  int bidx = blockIdx.x;
  int st = lod[bidx];
  int ed = lod[bidx + 1];
-  int row_start = bidx * num_prior_box;
+  int row_start = bidx * M;
  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
    int id = row_start + neg_indices[i];
-    out_label[id] = background_label;
-    out_label_wt[id] = 1.;
+    for (int k = 0; k < K; ++k) {
+      out[id * K + k] = T(mismatch_value);
+    }
+    // One weight per (instance, column), as in the removed out_label_wt[id]
+    // write above; scaling the index by K would write out of bounds for
+    // K > 1.
+    out_wt[id] = WT(1.);
  }
 }
-template <typename T>
+template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {  // CUDA specialization: launches NegTargetAssignKernel on ctx.stream()
  void operator()(const platform::CUDADeviceContext& ctx,
-                  const int* neg_indices, const size_t* lod, const int num,
+                  const int* neg_indices, const size_t* lod, const int N,
-                  const int num_prior_box, const int background_label,
+                  const int M, const int K, const int mismatch_value, T* out,
-                  int* out_label, T* out_label_wt) {
+                  WT* out_wt) {
    const int block_size = 256;
-    const int grid_size = num;
+    const int grid_size = N;  // one thread block per instance: the kernel reads lod[blockIdx.x]
-    NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
+    NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
-        neg_indices, lod, num, num_prior_box, background_label, out_label,
+        neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
-        out_label_wt);
  }
 };
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
+                                       float>;
 }  // namespace operators
 }  // namespace paddle
@@ -57,5 +59,5 @@ template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
--- a/paddle/fluid/operators/target_assign_op.h
+++ b/paddle/fluid/operators/target_assign_op.h
@@ -19,140 +19,113 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+template <typename T, typename WT>
-template <typename T>
 struct TargetAssignFunctor {
-  const T* gt_box_;
+  const T* in_;
-  const int* gt_label_;
  const int* match_indices_;
  const size_t* lod_;
-  const int background_label_;
+  const int mismatch_value_;
-  const int64_t num_;
+  const int64_t N_;
-  const int64_t num_prior_box_;
+  const int64_t M_;
+  const int64_t P_;
-  T* out_box_;
+  const int64_t K_;
-  T* out_box_wt_;
-  int* out_label_;
+  T* out_;
-  T* out_label_wt_;
+  WT* out_wt_;
-  TargetAssignFunctor(const T* gt_box, const int* gt_label,
+  TargetAssignFunctor(const T* input, const int* match_indices,
-                      const int* match_indices, const size_t* lod,
+                      const size_t* lod, const int mismatch_value,
-                      const int background_label, const int64_t num,
+                      const int64_t N, const int64_t M, const int64_t P,
-                      const int64_t np, T* out_box, T* out_box_wt,
+                      const int64_t K, T* out, WT* out_wt)
-                      int* out_label, T* out_label_wt)
+      : in_(input),
-      : gt_box_(gt_box),
-        gt_label_(gt_label),
        match_indices_(match_indices),
        lod_(lod),
-        background_label_(background_label),
+        mismatch_value_(mismatch_value),
-        num_(num),
+        N_(N),
-        num_prior_box_(np),
+        M_(M),
-        out_box_(out_box),
+        P_(P),
-        out_box_wt_(out_box_wt),
+        K_(K),
-        out_label_(out_label),
+        out_(out),
-        out_label_wt_(out_label_wt) {}
+        out_wt_(out_wt) {}
  HOSTDEVICE void operator()(size_t i) const {
-    int row = i / num_prior_box_;
+    int h = i / M_;
-    int col = i - row * num_prior_box_;
+    int w = i - h * M_;
-    size_t row_off = lod_[row];
+    size_t off = lod_[h];
-    int offset = row * num_prior_box_ + col;
+    int id = match_indices_[i];
-    int id = match_indices_[offset];
+    T* out = out_ + i * K_;
-    T* obox = out_box_ + offset * 4;
+    WT* out_wt = out_wt_ + i;
-    int* olabel = out_label_ + offset;
-    T* obox_wt = out_box_wt_ + offset;
-    T* olabel_wt = out_label_wt_ + offset;
    if (id > -1) {
-      const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4;
+      int w_off = w % P_;
+      const T* in = in_ + ((off + id) * P_ + w_off) * K_;
-      obox[0] = gtbox[0];
+      for (int64_t k = 0; k < K_; ++k) {
-      obox[1] = gtbox[1];
+        out[k] = in[k];
-      obox[2] = gtbox[2];
+      }
-      obox[3] = gtbox[3];
+      out_wt[0] = static_cast<WT>(1.);
-      olabel[0] = gt_label_[row_off + id];
-      obox_wt[0] = static_cast<T>(1.);
-      olabel_wt[0] = static_cast<T>(1.);
    } else {
-      obox[0] = static_cast<T>(0.);
+      for (int64_t k = 0; k < K_; ++k) {
-      obox[1] = static_cast<T>(0.);
+        out[k] = static_cast<T>(mismatch_value_);
-      obox[2] = static_cast<T>(0.);
+      }
-      obox[3] = static_cast<T>(0.);
+      out_wt[0] = static_cast<WT>(0.);
-      olabel[0] = background_label_;
-      obox_wt[0] = static_cast<T>(0.);
-      olabel_wt[0] = static_cast<T>(0.);
    }
  }
 };
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename WT>
 struct NegTargetAssignFunctor {
  void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
-                  const size_t* lod, const int num, const int num_prior_box,
+                  const size_t* lod, const int N, const int M, const int K,
-                  const int background_label, int* out_label,
+                  const int mismatch_value, T* out, WT* out_wt) const;
-                  T* out_label_wt) const;
 };
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename WT>
 class TargetAssignKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox");
+    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
    auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
-    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
-    auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
-    auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
-    auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
-    auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
-    PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL);
+    auto* out = ctx.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL);
+    auto* out_wt = ctx.Output<framework::Tensor>("OutWeight");
-    PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
-    int background_label = ctx.Attr<int>("background_label");
+    PADDLE_ENFORCE_EQ(x->lod().size(), 1UL);
+    int mismatch_value = ctx.Attr<int>("mismatch_value");
-    const T* box_data = enc_gt_box->data<T>();
+    const T* x_data = x->data<T>();
-    const int* label_data = gt_label->data<int>();
    const int* match_idx_data = match_indices->data<int>();
-    const int* neg_idx_data = neg_indices->data<int>();
-    T* obox_data = out_box->mutable_data<T>(ctx.GetPlace());
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-    T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace());
+    WT* out_wt_data = out_wt->mutable_data<WT>(ctx.GetPlace());
-    int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
-    T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
-    int64_t num = match_indices->dims()[0];
+    int64_t n = match_indices->dims()[0];
-    int64_t num_prior_box = match_indices->dims()[1];
+    int64_t m = match_indices->dims()[1];
+    int64_t p = x->dims()[1];
+    int64_t k = x->dims()[2];
-    auto gt_lod = enc_gt_box->lod().back();
+    auto x_lod = x->lod().back();
-    auto gt_label_lod = gt_label->lod().back();
+    size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
-    auto neg_lod = neg_indices->lod().back();
-    for (size_t i = 0; i < gt_lod.size(); ++i) {
-      PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
-    }
-    size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace());
-    size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
-    TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
+    TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
-                                   gt_lod_data, background_label, num,
+                                       mismatch_value, n, m, p, k, out_data,
-                                   num_prior_box, obox_data, obox_wt_data,
+                                       out_wt_data);
-                                   olabel_data, olabel_wt_data);
    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(device_ctx,
+    platform::ForRange<DeviceContext> for_range(device_ctx, n * m);
-                                                num * num_prior_box);
    for_range(functor);
-    NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor;
+    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
-    neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box,
+    if (neg_indices) {
-                    background_label, olabel_data, olabel_wt_data);
+      PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
+      const int* neg_idx_data = neg_indices->data<int>();
+      auto neg_lod = neg_indices->lod().back();
+      size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+      NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
+      neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
+                      mismatch_value, out_data, out_wt_data);
+    }
  }
 };

--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"
 #include <ostream>
 #include <sstream>

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -23,8 +23,8 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/macros.h"
-#include "paddle/string/printf.h"
+#include "paddle/fluid/string/printf.h"
-#include "paddle/string/to_string.h"
+#include "paddle/fluid/string/to_string.h"
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle

--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/string/piece.h"
+#include "paddle/fluid/string/piece.h"
 using StringPiece = paddle::string::Piece;
 using paddle::string::HasPrefix;

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -35,7 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/pybind.h"
 #include "paddle/fluid/pybind/tensor_py.h"
-#include "paddle/string/to_string.h"
+#include "paddle/fluid/string/to_string.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

--- a/paddle/string/.clang-format
+++ b/paddle/string/.clang-format
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
--- a/paddle/string/piece.cc
+++ b/paddle/string/piece.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/string/piece.h"
+#include "piece.h"
 #include <string.h>

--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -28,7 +28,7 @@ namespace string {
 // its syntax is simple as it doesn't own/manage the string, it is
 // cheap to construct Pieces and pass them around.
 class Piece {
-public:
+ public:
  static const size_t npos = static_cast<size_t>(-1);
  // We provide non-explicit singleton constructors so users can
@@ -55,7 +55,7 @@ public:
  // Return a string that contains the copy of the referenced data.
  std::string ToString() const { return std::string(data_, size_); }
-private:
+ private:
  const char* data_;
  size_t size_;

--- a/paddle/string/piece_test.cc
+++ b/paddle/string/piece_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/string/piece.h"
+#include "paddle/fluid/string/piece.h"
 #include <sstream>

--- a/paddle/string/printf.h
+++ b/paddle/string/printf.h
@@ -71,7 +71,7 @@
 #include <iostream>
 #include <sstream>
-#include "paddle/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+#include "tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
 namespace paddle {
 namespace string {

--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/string/printf.h"
+#include "printf.h"
 #include <string>
@@ -24,6 +24,6 @@ TEST(StringPrintf, StringPrintf) {
  long hour = 14;
  int min = 44;
  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
-            paddle::string::Sprintf(
+            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
-                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
+                                    hour, min));
 }
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
@@ -147,7 +147,7 @@ namespace detail {
 // Test whether type T1 is convertible to type T2
 template <typename T1, typename T2>
 struct is_convertible {
-private:
+ private:
  // two types of different size
  struct fail {
    char dummy[2];
@@ -160,7 +160,7 @@ private:
  static succeed tryConvert(const T2 &);
  static const T1 &makeT1();
-public:
+ public:
  // Standard trick: the (...) version of tryConvert will be chosen from
  // the overload set only if the version taking a T2 doesn't match.
  // Then we compare the sizes of the return types to check which
@@ -170,8 +170,7 @@ public:
 // Format the value by casting to type fmtT.  This default implementation
 // should never be called.
-template <typename T,
+template <typename T, typename fmtT,
-          typename fmtT,
          bool convertible = is_convertible<T, fmtT>::value>
 struct formatValueAsType {
  static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
@@ -241,11 +240,8 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
 /// operator<< to format the type T, with special cases for the %c and %p
 /// conversions.
 template <typename T>
-inline void formatValue(std::ostream &out,
+inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
-                        const char * /*fmtBegin*/,
+                        const char *fmtEnd, int ntrunc, const T &value) {
-                        const char *fmtEnd,
-                        int ntrunc,
-                        const T &value) {
  // The mess here is to support the %c and %p conversions: if these
  // conversions are active we try to convert the type to a char or const
  // void* respectively and format that instead of the value itself.  For the
@@ -267,25 +263,22 @@ inline void formatValue(std::ostream &out,
 }
 // Overloaded version for char types to support printing as an integer
-#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \
+#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType)                      \
-  inline void formatValue(std::ostream &out,         \
+  inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,   \
-                          const char * /*fmtBegin*/, \
+                          const char *fmtEnd, int /**/, charType value) { \
-                          const char *fmtEnd,        \
+    switch (*(fmtEnd - 1)) {                                              \
-                          int /**/,                  \
+      case 'u':                                                           \
-                          charType value) {          \
+      case 'd':                                                           \
-    switch (*(fmtEnd - 1)) {                         \
+      case 'i':                                                           \
-      case 'u':                                      \
+      case 'o':                                                           \
-      case 'd':                                      \
+      case 'X':                                                           \
-      case 'i':                                      \
+      case 'x':                                                           \
-      case 'o':                                      \
+        out << static_cast<int>(value);                                   \
-      case 'X':                                      \
+        break;                                                            \
-      case 'x':                                      \
+      default:                                                            \
-        out << static_cast<int>(value);              \
+        out << value;                                                     \
-        break;                                       \
+        break;                                                            \
-      default:                                       \
+    }                                                                     \
-        out << value;                                \
-        break;                                       \
-    }                                                \
  }
 // per 3.9.1: char, signed char and unsigned char are all distinct types
 TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
@@ -482,7 +475,7 @@ namespace detail {
 // each argument to be allocated as a homogenous array inside FormatList
 // whereas a naive implementation based on inheritance does not.
 class FormatArg {
-public:
+ public:
  FormatArg() {}
  template <typename T>
@@ -491,22 +484,17 @@ public:
        m_formatImpl(&formatImpl<T>),
        m_toIntImpl(&toIntImpl<T>) {}
-  void format(std::ostream &out,
+  void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd,
-              const char *fmtBegin,
-              const char *fmtEnd,
              int ntrunc) const {
    m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
  }
  int toInt() const { return m_toIntImpl(m_value); }
-private:
+ private:
  template <typename T>
-  static void formatImpl(std::ostream &out,
+  static void formatImpl(std::ostream &out, const char *fmtBegin,
-                         const char *fmtBegin,
+                         const char *fmtEnd, int ntrunc, const void *value) {
-                         const char *fmtEnd,
-                         int ntrunc,
-                         const void *value) {
    formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
  }
@@ -516,11 +504,8 @@ private:
  }
  const void *m_value;
-  void (*m_formatImpl)(std::ostream &out,
+  void (*m_formatImpl)(std::ostream &out, const char *fmtBegin,
-                       const char *fmtBegin,
+                       const char *fmtEnd, int ntrunc, const void *value);
-                       const char *fmtEnd,
-                       int ntrunc,
-                       const void *value);
  int (*m_toIntImpl)(const void *value);
 };
@@ -569,12 +554,10 @@ inline const char *printFormatStringLiteral(std::ostream &out,
 // necessary to pull out variable width and precision .  The function returns a
 // pointer to the character after the end of the current format spec.
 inline const char *streamStateFromFormat(std::ostream &out,
-                                         bool &spacePadPositive,
+                                         bool &spacePadPositive, int &ntrunc,
-                                         int &ntrunc,
                                         const char *fmtStart,
                                         const detail::FormatArg *formatters,
-                                         int &argIndex,
+                                         int &argIndex, int numFormatters) {
-                                         int numFormatters) {
  if (*fmtStart != '%') {
    TINYFORMAT_ERROR(
        "tinyformat: Not enough conversion specifiers in format string");
@@ -750,10 +733,8 @@ inline const char *streamStateFromFormat(std::ostream &out,
 }
 //------------------------------------------------------------------------------
-inline void formatImpl(std::ostream &out,
+inline void formatImpl(std::ostream &out, const char *fmt,
-                       const char *fmt,
+                       const detail::FormatArg *formatters, int numFormatters) {
-                       const detail::FormatArg *formatters,
-                       int numFormatters) {
  // Saved stream state
  std::streamsize origWidth = out.width();
  std::streamsize origPrecision = out.precision();
@@ -765,13 +746,9 @@ inline void formatImpl(std::ostream &out,
    fmt = printFormatStringLiteral(out, fmt);
    bool spacePadPositive = false;
    int ntrunc = -1;
-    const char *fmtEnd = streamStateFromFormat(out,
+    const char *fmtEnd =
-                                               spacePadPositive,
+        streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters,
-                                               ntrunc,
+                              argIndex, numFormatters);
-                                               fmt,
-                                               formatters,
-                                               argIndex,
-                                               numFormatters);
    if (argIndex >= numFormatters) {
      // Check args remain after reading any variable width/precision
      TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
@@ -820,15 +797,14 @@ inline void formatImpl(std::ostream &out,
 /// information has been stripped from the arguments, leaving just enough of a
 /// common interface to perform formatting as required.
 class FormatList {
-public:
+ public:
  FormatList(detail::FormatArg *formatters, int N)
      : m_formatters(formatters), m_N(N) {}
-  friend void vformat(std::ostream &out,
+  friend void vformat(std::ostream &out, const char *fmt,
-                      const char *fmt,
                      const FormatList &list);
-private:
+ private:
  const detail::FormatArg *m_formatters;
  int m_N;
 };
@@ -841,7 +817,7 @@ namespace detail {
 // Format list subclass with fixed storage to avoid dynamic allocation
 template <int N>
 class FormatListN : public FormatList {
-public:
+ public:
  template <typename... Args>
  FormatListN(const Args &... args)
      : FormatList(&m_formatterStore[0], N),
@@ -849,14 +825,14 @@ public:
    static_assert(sizeof...(args) == N, "Number of args must be N");
  }
-private:
+ private:
  FormatArg m_formatterStore[N];
 };
 // Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
 template <>
 class FormatListN<0> : public FormatList {
-public:
+ public:
  FormatListN() : FormatList(0, 0) {}
 };

--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/string/to_string.h"
+#include "to_string.h"
 #include <gtest/gtest.h>
 constexpr char kOutputString[] = "User Defined Output";
 class UserDefinedClass {
-public:
+ public:
 };
 std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -115,8 +115,8 @@ EOF
            -DWITH_AVX=${WITH_AVX:-ON} \
            -DWITH_SWIG_PY=ON \
            -DWITH_STYLE_CHECK=OFF
-        make -j `nproc` gen_proto_py
+        make -j `nproc` gen_proto_py framework_py_proto
-        make -j `nproc` paddle_python
+        make -j `nproc` copy_paddle_pybind
        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
        popd
    fi

--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -6,9 +6,9 @@ mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
-make -j `nproc` gen_proto_py
+make -j `nproc` gen_proto_py framework_py_proto
-make -j `nproc` paddle_python
+make -j `nproc` copy_paddle_pybind
 make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 # check websites for broken links

--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -33,6 +33,57 @@ class VarBlock:
        return "%s:%d:%d" % (self.varname, self.offset, self.size)
+class UnionFind(object):
+    """ Union-find data struct.
+    Union-find is a data structure that keeps track of a set of elements
+    partitioned into a number of disjoint (non-overlapping) subsets.
+    Reference:
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+    Args:
+      elementes(list): The initial list of elements.
+    """
+    def __init__(self, elementes=None):
+        self._parents = []  # index -> parent index
+        self._index = {}  # element -> index
+        self._curr_idx = 0
+        if not elementes:
+            elementes = []
+        for ele in elementes:
+            self._parents.append(self._curr_idx)
+            self._index.update({ele: self._curr_idx})
+            self._curr_idx += 1
+    def find(self, x):
+        # Find the root index of the given element x,
+        # performing path compression while finding the root index.
+        if not x in self._index:
+            return -1
+        idx = self._index[x]
+        while idx != self._parents[idx]:
+            t = self._parents[idx]
+            self._parents[idx] = self._parents[t]
+            idx = t
+        return idx
+    def union(self, x, y):
+        # Union the sets that contain the two given elements.
+        x_root = self.find(x)
+        y_root = self.find(y)
+        if x_root == y_root:
+            return
+        self._parents[x_root] = y_root
+    def is_connected(self, x, y):
+        # If two given elements have the same root index,
+        # then they are connected.
+        return self.find(x) == self.find(y)
 def same_or_split_var(p_name, var_name):
    return p_name == var_name or p_name.startswith(var_name + ".block")
@@ -203,6 +254,21 @@ class DistributeTranspiler:
                    (varname, self.trainer_id)
                startup_prog.global_block().rename_var(varname, new_var_name)
+    #     self.lr_param_mapping = self._create_lr_param_mapping()
+    # def _create_lr_param_mapping(self):
+    #     lr_mapping = dict()
+    #     for _, opt_op in enumerate(self.optimize_ops):
+    #         if not opt_op.inputs or not opt_op.inputs.has_key("LearningRate") \
+    #           or not opt_op.inputs.has_key("Param"):
+    #             continue
+    #         lr = opt_op.inputs["LearningRate"].name
+    #         param = opt_op.inputs["Param"].name
+    #         if not lr_mapping.has_key(lr):
+    #             lr_mapping.update({lr: list()})
+    #         lr_mapping[lr].append(param)
+    #     return lr_mapping
    def _create_vars_from_blocklist(self, program, block_list):
        # Create respective variables using the block_list
        block_map = dict()
@@ -333,50 +399,15 @@ class DistributeTranspiler:
            pass
        return orig_shape
-    def _op_input_var(self, op, varname):
+    def _fetch_var_names(self, param_dict):
-        pass
+        res = []
+        if not param_dict:
-    def _is_op_on_pserver(self, endpoint, all_ops, idx):
+            return res
-        """
+        for _, values in param_dict.iteritems():
-        Recursively check if the op need to run on current server.
+            if not isinstance(values, list):
-        Assume that ops are in the execution order.
+                values = [values]
-        """
+            res += [v.name for v in values]
-        param_names = [
+        return res
-            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
-        ]
-        op = all_ops[idx]
-        input_names = set(op.input_names)
-        # TODO(typhoonzero): using Param and Grad input name to identify
-        # that the operator is an optimization operator, need a better way.
-        if "Param" in input_names:
-            if op.input("Param")[0] in param_names:
-                return True
-            else:
-                for n in param_names:
-                    if same_or_split_var(n, op.input("Param")[0]) \
-                            and n != op.input("Param")[0]:
-                        return True
-                return False
-        else:
-            j = idx - 1
-            while j >= 0:
-                prev_op = all_ops[j]
-                # NOTE(typhoonzero): consider list input/output
-                prev_output_names = prev_op.desc.output_arg_names()
-                prev_input_names = prev_op.desc.input_arg_names()
-                found1 = False
-                found2 = False
-                for varname in op.desc.input_arg_names():
-                    if varname in prev_output_names:
-                        found1 = self._is_op_on_pserver(endpoint, all_ops, j)
-                # later ops may produce output for prev op's next batch use.
-                for varname in op.desc.output_arg_names():
-                    if varname in prev_input_names:
-                        found2 = self._is_op_on_pserver(endpoint, all_ops, j)
-                if found1 or found2:
-                    return True
-                j -= 1
-            return False
    def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
        program = optimize_block.program
@@ -394,11 +425,7 @@ class DistributeTranspiler:
                    # do not append this op if current endpoint
                    # is not dealing with this grad block
                    return
-                merged_var = program.global_block().create_var(
+                merged_var = program.global_block().vars[grad_block.name]
-                    name=grad_block.name,
-                    persistable=grad_block.persistable,
-                    dtype=grad_block.dtype,
-                    shape=grad_block.shape)
                # append merging ops if trainers > 1
                if self.trainers > 1:
                    vars2merge = self._create_var_for_trainers(
@@ -429,13 +456,19 @@ class DistributeTranspiler:
                    shape=param_block.shape)
                new_inputs[key] = tmpvar
+            elif key == "LearningRate":
+                # The learning rate variable has already been created by a
+                # non-optimize op, so don't create it again.
+                new_inputs[key] = program.global_block().vars[opt_op.input(key)[
+                    0]]
        for key in opt_op.input_names:
-            if key in ["Param", "Grad"]:
+            new_shape = None
+            if key in ["Param", "Grad", "LearningRate"]:
                continue
+            var = program.global_block().vars[opt_op.input(key)[0]]
            # update accumulator variable shape
            param_shape = new_inputs["Param"].shape
-            var = program.global_block().vars[opt_op.input(key)[0]]
            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
                                                        var.shape, param_shape)
            tmpvar = program.global_block().create_var(
@@ -446,12 +479,11 @@ class DistributeTranspiler:
            new_inputs[key] = tmpvar
        # change output's ParamOut variable
-        outputs = self._get_output_map_from_op(program.global_block(), opt_op)
+        opt_op.outputs["ParamOut"] = new_inputs["Param"]
-        outputs["ParamOut"] = new_inputs["Param"]
        optimize_block.append_op(
            type=opt_op.type,
            inputs=new_inputs,
-            outputs=outputs,
+            outputs=opt_op.outputs,
            attrs=opt_op.attrs)
    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
@@ -459,11 +491,10 @@ class DistributeTranspiler:
        # Append the ops for parameters that do not need to be optimized/updated
        inputs = self._get_input_map_from_op(self.program.global_block().vars,
                                             opt_op)
-        for var in inputs.itervalues():
+        for varlist in inputs.itervalues():
-            if type(var) == list:
+            if not isinstance(varlist, list):
-                varlist = var
+                varlist = [varlist]
-            else:
-                varlist = [var]
            for var in varlist:
                if not program.global_block().vars.has_key(var.name):
                    program.global_block().create_var(
@@ -475,12 +506,70 @@ class DistributeTranspiler:
        outputs = self._get_output_map_from_op(self.program.global_block().vars,
                                               opt_op)
+        for varlist in outputs.itervalues():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                program.global_block().create_var(
+                    name=var.name,
+                    persistable=var.persistable,
+                    dtype=var.dtype,
+                    shape=var.shape)
        optimize_block.append_op(
            type=opt_op.type,
            inputs=inputs,
            outputs=outputs,
            attrs=opt_op.attrs)
+    def _is_op_connected(self, op1, op2):
+        # If one op's input is another op's output, or
+        # one op's output is another op's input, we say
+        # the two operators are connected.
+        op1_input_names = self._fetch_var_names(op1.inputs)
+        op1_output_names = self._fetch_var_names(op1.outputs)
+        op2_input_names = self._fetch_var_names(op2.inputs)
+        op2_output_names = self._fetch_var_names(op2.outputs)
+        if set(op1_output_names) & set(op2_input_names) or \
+           set(op1_input_names) & set(op2_output_names):
+            return True
+        return False
+    def _create_ufind(self, optimize_ops):
+        # Create a union-find data structure from the optimize ops
+        ufind = UnionFind(optimize_ops)
+        for i in xrange(len(optimize_ops)):
+            for j in xrange(i, len(optimize_ops)):
+                op1 = optimize_ops[i]
+                op2 = optimize_ops[j]
+                if self._is_op_connected(op1, op2):
+                    ufind.union(op1, op2)
+        return ufind
+    def _is_opt_op(self, op):
+        # NOTE: This is a hacky implementation.
+        # optimize ops: SGDOptimizer, MomentumOptimizer, AdamOptimizer, etc.
+        if "Param" in op.input_names and \
+            "LearningRate" in op.input_names:
+            return True
+        return False
+    def _is_opt_op_on_pserver(self, endpoint, op):
+        param_names = [
+            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
+        ]
+        if op.input("Param") in param_names:
+            return True
+        else:
+            for n in param_names:
+                param = op.input("Param")
+                if same_or_split_var(n, param) and n != param:
+                    return True
+            return False
+        return False
    def get_pserver_program(self, endpoint):
        """
        Get pserver side program using the endpoint
@@ -514,17 +603,30 @@ class DistributeTranspiler:
                recv_inputs.append(var)
        # step6
        optimize_block = pserver_program.create_block(0)
-        # Iterate through the ops and append ops as needed
+        # step 6.1
-        for idx, opt_op in enumerate(self.optimize_ops):
+        # Create a union-find data struct by optimize ops,
-            is_op_on_pserver = self._is_op_on_pserver(endpoint,
+        # If two ops are connected, we could add these two ops
-                                                      self.optimize_ops, idx)
+        # into one set.
-            if not is_op_on_pserver:
+        ufind = self._create_ufind(self.optimize_ops)
-                continue
+        # step 6.2 
-            if "Grad" in opt_op.desc.input_arg_names():
+        # Iterate through the ops and append optimize op which
-                self._append_pserver_ops(optimize_block, opt_op, endpoint)
+        # is located on the current pserver
-            else:
+        opt_op_on_pserver = []
-                self._append_pserver_non_opt_ops(optimize_block, opt_op)
+        for _, op in enumerate(self.optimize_ops):
+            if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
+                opt_op_on_pserver.append(op)
+        # step 6.3
+        # Iterate through the ops, and if an op is in the same set as
+        # an optimize op located on the current pserver, then
+        # append it to the sub program.
+        for _, op in enumerate(self.optimize_ops):
+            for _, opt_op in enumerate(opt_op_on_pserver):
+                if ufind.is_connected(op, opt_op):
+                    if self._is_opt_op(op):
+                        self._append_pserver_ops(optimize_block, op, endpoint)
+                    else:
+                        self._append_pserver_non_opt_ops(optimize_block, op)
+                    break
        # Append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",

--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -117,6 +117,7 @@ def monkey_patch_variable():
            tmp_name = unique_tmp_name()
            out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)
            self.block.append_op(
                type=op_type,
                inputs={'X': [self],

--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
@@ -99,7 +99,7 @@ elif training_role == "TRAINER":
    exe.run(fluid.default_startup_program())
    for pass_id in range(PASS_NUM):
        for data in train_reader():
-            avg_cost_np = exe.run(fluid.default_main_program(),
+            avg_cost_np = exe.run(t.get_trainer_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])
            print("avg_cost_np", avg_cost_np)

--- a/python/paddle/v2/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py
@@ -64,9 +64,7 @@ exe = fluid.Executor(place)
 [res1, res2] = exe.run(prog, fetch_list=[out1, out2])
-test_pass = res1.shape == (10, 2) and res2.shape == (10, 1)
+if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
-if not test_pass:
    exit(1)
 exit(0)
--- a/python/paddle/v2/fluid/tests/test_sequence_expand.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py
@@ -73,5 +73,20 @@ class TestSequenceExpandCase3(TestSequenceExpand):
        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+class TestSequenceExpandCase4(TestSequenceExpand):
+    def set_data(self):
+        x_data = np.array(
+            [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
+                [2, 5]).astype('float32')
+        x_lod = [[
+            0,
+            1,
+            2,
+        ]]
+        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
+        y_lod = [[0, 1, 2], [0, 1, 2]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_split_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_op.py
@@ -20,11 +20,11 @@ from op_test import OpTest
 class TestSplitOp(OpTest):
    def setUp(self):
        self.op_type = "split"
-        axis = 0
+        axis = 1
-        x = np.random.random((4, 2, 5)).astype('float32')
+        x = np.random.random((4, 5, 6)).astype('float32')
-        out = np.split(x, [1, 3], axis)
+        out = np.split(x, [2, 3], axis)
        self.inputs = {'X': x}
-        self.attrs = {'axis': axis, 'sections': [1, 2, 1]}
+        self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
        self.outputs = {'Out': [('out%d' % i, out[i]) \
            for i in xrange(len(out))]}

--- a/python/paddle/v2/fluid/tests/test_target_assign_op.py
+++ b/python/paddle/v2/fluid/tests/test_target_assign_op.py
@@ -43,7 +43,7 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
 def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
-                  neg_lod, background_label):
+                  neg_lod, mismatch_value):
    batch_size, num_prior = match_indices.shape
    # init target bbox
@@ -52,7 +52,7 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
    trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
    # init target label
    trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
-    trg_label = trg_label * background_label
+    trg_label = trg_label * mismatch_value
    # init weight for target label
    trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
@@ -65,53 +65,90 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
        # target bbox
        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
            trg_box[i][c][:] = encoded_box[v][c][:]
        # weight for target bbox
        trg_box_wt[i][col_ids] = 1.0
        trg_label[i][col_ids] = gt_label[col_val + gt_start]
        trg_label_wt[i][col_ids] = 1.0
        # set target label weight to 1.0 for the negative samples
-        neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+        if neg_indices is not None:
-        trg_label_wt[i][neg_ids] = 1.0
+            neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+            trg_label_wt[i][neg_ids] = 1.0
    return trg_box, trg_box_wt, trg_label, trg_label_wt
-class TestTargetAssginOp(OpTest):
+class TestTargetAssginFloatType(OpTest):
    def setUp(self):
        self.op_type = "target_assign"
+        num_prior = 120
+        num_class = 21
+        gt_lod = [0, 5, 11, 23]
+        neg_lod = [0, 4, 7, 13]
+        mismatch_value = 0
+        batch_size = len(gt_lod) - 1
+        num_gt = gt_lod[-1]
+        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
+        gt_label = np.random.randint(
+            num_class, size=(num_gt, 1)).astype('int32')
+        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
+                                                               gt_lod, neg_lod)
+        out, out_wt, _, _ = target_assign(encoded_box, gt_label, match_indices,
+                                          neg_indices, gt_lod, neg_lod,
+                                          mismatch_value)
+        # assign regression targets
+        x = encoded_box
+        self.inputs = {
+            'X': (x, [gt_lod]),
+            'MatchIndices': match_indices,
+        }
+        self.attrs = {'mismatch_value': mismatch_value}
+        self.outputs = {
+            'Out': out,
+            'OutWeight': out_wt,
+        }
+    def test_check_output(self):
+        self.check_output()
+class TestTargetAssginIntType(OpTest):
+    def setUp(self):
+        self.op_type = "target_assign"
        num_prior = 120
        num_class = 21
        gt_lod = [0, 5, 11, 23]
        neg_lod = [0, 4, 7, 13]
+        mismatch_value = 0
        batch_size = len(gt_lod) - 1
        num_gt = gt_lod[-1]
-        background_label = 0
        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
        gt_label = np.random.randint(
            num_class, size=(num_gt, 1)).astype('int32')
        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
                                                               gt_lod, neg_lod)
-        trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
-            encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
-            background_label)
+        _, _, out, out_wt, = target_assign(encoded_box, gt_label, match_indices,
+                                           neg_indices, gt_lod, neg_lod,
+                                           mismatch_value)
+        # assign classification targets
+        x = np.reshape(gt_label, (num_gt, 1, 1))
        self.inputs = {
-            'EncodedGTBBox': (encoded_box, [gt_lod]),
+            'X': (x, [gt_lod]),
-            'GTScoreLabel': (gt_label, [gt_lod]),
+            'MatchIndices': match_indices,
-            'MatchIndices': (match_indices),
            'NegIndices': (neg_indices, [neg_lod]),
        }
-        self.attrs = {'background_label': background_label}
+        self.attrs = {'mismatch_value': mismatch_value}
        self.outputs = {
-            'PredBBoxLabel': (trg_box),
+            'Out': out,
-            'PredBBoxWeight': (trg_box_wt),
+            'OutWeight': out_wt,
-            'PredScoreLabel': (trg_label),
-            'PredScoreWeight': (trg_label_wt),
        }
    def test_check_output(self):