diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 7d53554358497762b1cd91c39bdd23c5807af2bc..df186637726f60ee1b69cec7291477f3efcd059c 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -16,12 +16,10 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
-        if(IS_DIRECTORY ${src})
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
-        else()
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
-        endif()
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
+          COMMAND mkdir -p "${dst}"
+          COMMAND cp -r "${src}" "${dst}"
+          COMMENT "copying ${src} -> ${dst}")
     endforeach()
 endfunction()
 
@@ -53,11 +51,11 @@ IF(NOT PROTOBUF_FOUND)
 ENDIF(NOT PROTOBUF_FOUND)
 
 # paddle fluid module
-set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
 
@@ -69,7 +67,7 @@ copy(memory_lib
 
 set(module "inference")
 copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index bfbb2cfc2c57c705cf42c65825edcc6dea08cf41..2746168f1dda493368b81820bde2f093d06d7b4e 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -25,7 +25,10 @@ namespace framework {
 class CosineOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -44,7 +47,10 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 61529fe38b15fe2a4bfa0d64159994d6b62fb086..8effbf1bc6298bdcc381e2176411a79da134653f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -64,6 +64,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  if (platform::is_gpu_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("Cannot run operator on place %s", place);
+#else
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    platform::SetDeviceId(dev_id);
+#endif
+  }
+  RunImpl(scope, place);
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -479,8 +491,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
-void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::Place& place) const {
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 52300abeb7df346d610d2363335dc9d3330ee39e..708f87dc8632ac500e1050122c5fd5412071fd22 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -89,8 +89,9 @@ class OperatorBase {
 
   std::string DebugString() const { return DebugStringEx(nullptr); }
 
-  /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
+  /// Net will call this interface function to Run an op.
+  /// The actual implementation should be written in RunImpl.
+  void Run(const Scope& scope, const platform::Place& place);
 
   // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
   virtual void Stop() {}
@@ -144,6 +145,8 @@ class OperatorBase {
  private:
   void GenerateTemporaryNames();
   void CheckAllInputOutputSet() const;
+  virtual void RunImpl(const Scope& scope,
+                       const platform::Place& place) const = 0;
 };
 
 // Macro for define a clone method.
@@ -168,10 +171,13 @@ class OperatorBase {
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
   std::unique_ptr<OperatorBase> Clone() const override {
     return std::unique_ptr<OperatorBase>(new NOP(*this));
   }
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class ExecutionContext {
@@ -363,8 +369,6 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const Scope& scope, const platform::Place& place) const final;
-
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
   AllOpKernels() {
     static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
@@ -393,6 +397,7 @@ class OperatorWithKernel : public OperatorBase {
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index b90f5538bb620275521cdc11bf47b4014b2a66e2..0732ec5afe8738313e1d73c52c5303a2e8b1e96a 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -28,7 +28,10 @@ class OpWithoutKernelTest : public OperatorBase {
   OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                       const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++op_run_num;
     ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
     ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -259,8 +262,10 @@ class OperatorClone : public paddle::framework::OperatorBase {
                 const paddle::framework::VariableNameMap& outputs,
                 const paddle::framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const paddle::framework::Scope& scope,
+               const paddle::platform::Place& place) const override {}
 };
 
 TEST(Operator, Clone) {
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index bf8e11bd8c047275fe341ead9424d02e98d5d8f4..69464c4cff52400d8a25a692c5df8d2fe06230e4 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -31,8 +31,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index f99f9af4276c0e8928f821ae166d55aed02e8e27..b72e72b12f8a6155b6eb3be1468b8dbc7bd48d4e 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -71,8 +71,10 @@ class AssignOp : public framework::OperatorBase {
            const framework::VariableNameMap &outputs,
            const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) {
       return;
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 7737d4e098ac9a0e56e1db2aee796550e8d71ba3..6d3efcfeb8497a78d56180898e5e3a66e52ff22d 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -55,8 +55,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                      const framework::VariableNameMap& outputs,
                      const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
 
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index 9e2a05a60c30e388093aceddd40e58273364c8f9..bfbe78097d2f20ae4c5efa594d17f931c7ea5920 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -204,8 +204,9 @@ class BeamSearchOp : public framework::OperatorBase {
     PADDLE_THROW("Not Implemented");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
     auto pre_ids_var = scope.FindVar(Input("pre_ids"));
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 878e53058567500aeb9fe854a1a65ed5380572a8..c8a4292932dfaddb4ea73a0d1c8ff6bda02ce1c0 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -38,7 +38,7 @@ class ConcatKernel : public framework::OpKernel<T> {
       auto in_stride = framework::stride_numel(in->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                   out->data<T>() + output_offset, out_stride,
-                                  in->data<T>(), in_stride);
+                                  in->data<T>(), in_stride, in_stride[axis]);
       output_offset += in_stride[axis];
     }
   }
@@ -59,7 +59,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
index dd93790d5b52a2ccc8358a94f7ead346d384f191..d63748a61cec0f10269e05bcef3bb0d10345000d 100644
--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
@@ -193,7 +193,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
   }
 }
 
-void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
   // get device context from pool
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
index 695af4490696b29d2d47f5825ebc0159b39663c0..0bb14bc8c2cfabeeb13e1e1afd51b034742b74f0 100644
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
@@ -77,8 +77,9 @@ class CondOp : public framework::OperatorBase {
     sub_net_op_[FALSE_BRANCH] = std::move(net);
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override;
 
  private:
   const int TRUE_BRANCH = 0;
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 30435c6cca0a4fb1d41dce47b8fefeafb6c48a51..228b0998360550348fdd30c842a394e8f8ce5935 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -65,8 +65,10 @@ class ConditionalBlockOp : public ConditionalOp {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
 
     bool need_run;
@@ -128,8 +130,10 @@ class ConditionalBlockGradOp : public ConditionalOp {
                          const framework::VariableNameMap &outputs,
                          const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
 
     bool need_run;
diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc
index d1ba51f2c0f13a1b6e4d7ccb93c912703a0b1d86..1393f1a66baaf3b53f797aa61fd42ac3cf54f8db 100644
--- a/paddle/fluid/operators/create_reader_op.cc
+++ b/paddle/fluid/operators/create_reader_op.cc
@@ -106,8 +106,10 @@ template <typename T>
 class CreateRandomDataGeneratorOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
@@ -155,8 +157,10 @@ class CreateRandomDataGeneratorOpMaker
 class CreateShuffleReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
@@ -187,8 +191,10 @@ class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 class CreateBatchReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index 0b3f5f0d1d09a932e15936285f5cb226daa86e95..41fa69a0972ef8ad528f2a04b0260c40155ffd3e 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -24,8 +24,10 @@ class FeedOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 54e5892016cdb01f50189147a7453b868c5a48c0..6cb5565013dcacac33e828386f1ea8909e831c1a 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -26,8 +26,9 @@ class FetchOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index d4bf6406e5716a6b65a234d1cd642b64dcc5726f..6dd58d28db23ff3de8a27e898a9b539787d08718 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -33,8 +33,10 @@ class FillConstantInferShape : public framework::InferShapeBase {
 class FillConstantOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto data_type =
         static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 8e318f37cf0bc945597b5aa7b384e53038c97786..0b97c9c2827ac1be4e99c647dbedc2d9b8730e41 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -42,8 +42,10 @@ class FillOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &out =
         detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                 "Cannot find variable %s", Output("Out"))
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index ba908e472bbc165a244d8543713f1dbf293abb48..ef635048bd4faa2dc0067152f5f7472acbfe47af 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -37,8 +37,10 @@ class GetPlacesOp : public framework::OperatorBase {
               const framework::VariableNameMap &outputs,
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     bool is_gpu;
     if (Attr<std::string>("device_type") == "AUTO") {
       is_gpu = platform::is_gpu_place(place);
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index 3d488067b254c37515c6bdb9a4589aad311f344f..de4949584b7b20bec7b31f2ad1b69053ee9ffc0f 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -51,8 +51,9 @@ class IncrementOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index ea424018d66dac85d5a4ad75cbf5199064d52848..dac8505e3f2cb33b35b6184184e4762078a19c49 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -28,8 +28,9 @@ class IsEmptyOp : public framework::OperatorBase {
             const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get input
     auto *var = scope.FindVar(Input(kInput));
     PADDLE_ENFORCE_NOT_NULL(var);
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 1948063d886b79964b1a52d9d82a8e7d2fb0d493..d043702ebae627951927f2dbec893d40f77f0c73 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -26,8 +26,10 @@ class LoadCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
 
     std::ifstream fin(filename);
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index c9bf5d72b234f96d9eb5a4c275737ac8c18cd63d..9393cccfc66ec930db6ef68bd6f3c5065ceea80e 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -25,8 +25,10 @@ class LoadOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
index f11f5a89f5ad5b2f3deed905625aefa1e9d9935b..daa57c20450f1f92cb0bb500e37d0d8c49c05758 100644
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -25,8 +25,10 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 0b9426a9f8f0b0b3082667dc7a1414aceb824aca..3264766d6b693244f8dbfa6462b9c7aa13d5b5ec 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -23,8 +23,10 @@ class LoDRankTableOp : public framework::OperatorBase {
                  const framework::VariableNameMap &outputs,
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index edc32bcec1441e50e24612789727db9a044cde54..d6e24dc976a1ebe2afa182618d09839b105381c1 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -32,8 +32,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                           Input("X"))
                   .Get<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index eff8b927e52c94a4e19bb10c644cbaa34a7a0581..cef0dc307dbe97473e9041f51c25eca7cc9a0f1a 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -27,8 +27,9 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
     auto *out =
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 255f55334093213df867852e4d222f0e227e8c5d..88e67b6b86a3731cc2caf5529aa4892c6d605a86 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -27,8 +27,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
index 73a6c0b679310ac4108a915836b5ed497853b38b..540cf867418ec4378e5b97a343b9dcc85604f50c 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -237,6 +237,8 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
     }
 
     ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+    // The first dimension of NegIndices will be set correctly in Compute.
+    ctx->SetOutputDim("NegIndices", {-1, 1});
   }
 
  protected:
diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 52420ceba0de0323dae000aa301ce7838b3311b6..703e8dd00fc8e613344db11065d6a45afa2a0cc8 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -26,8 +26,9 @@ class NCCLInitOp : public framework::OperatorBase {
              const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h
index 14e5909851c4ac08b5f59c5c193c801827b91234..479ba386a70adaff09ae31e24c449fc18a9853b1 100644
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
@@ -57,20 +57,6 @@ class NetOp : public framework::OperatorBase {
     this->CompleteAddOp();
   }
 
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
   bool SupportGPU() const override {
     for (auto& op : ops_) {
       if (!op->SupportGPU()) {
@@ -117,6 +103,20 @@ class NetOp : public framework::OperatorBase {
   std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
 
  private:
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`, if no scope is provided, default
+   * scope will be used instead. If no OpContext is provided, default context
+   * will be used.
+   */
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, place);
+    }
+  }
+
   bool add_op_done_{false};
   std::set<std::string> intermediate_outputs_;
 
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
index cc20be0c81763abe2adcf09de858ce51e16d77a6..265f15e82ed29824ed65917dbe45e5edf9dc8993 100644
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
@@ -26,7 +26,10 @@ class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++run_cnt;
   }
 };
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index e25df92479943d210d98f02374f377f778f43d2c..d791d11172869d42b08c059b900e729bcc9b5d96 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -118,8 +118,9 @@ class ParallelDoOp : public framework::OperatorBase {
                const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
@@ -207,8 +208,9 @@ class ParallelDoGradOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
     auto *program = block->Program();
 
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 3616545309e8c279f61a22e571a5e71335c47f93..4d12fdbb6b62d1d7095d10aa6f33d12598a8e99e 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -130,8 +130,9 @@ class TensorPrintOp : public framework::OperatorBase {
     PADDLE_THROW("Not implemented.");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
     const framework::Variable* in_var_ptr = nullptr;
     std::string phase = kForward;
     std::string printed_var_name = "";
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 4d562c291911f54c9d1e8fed2e84035808bffbb7..127df82ff13b89de42e45113a21d6f5e7c2f20ed 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -54,8 +54,10 @@ class ReadInferVarType : public framework::VarTypeInference {
 class ReadOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     framework::ReaderHolder* reader =
         scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
     if (!reader->HasNext()) {
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index e4b9b8dab9b0394752d538aa5f59be3c06d0188f..33a744a5b7fef5802569a305d18746f04ed88136 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -226,8 +226,9 @@ class RecurrentOp : public RecurrentBase {
               const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
@@ -315,8 +316,9 @@ class RecurrentGradOp : public RecurrentBase {
                   const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index 148a65bb4b7fe599f2fdb833c179665e58fe1c41..79ba9e543b892d051995d4bafb0ceaaf09838cd2 100644
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -75,8 +75,10 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
                                   const framework::VariableNameMap &outputs,
                                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x =
         detail::Ref(scope.FindVar(Input("X")),
                     "Cannot find input lod tensor variable %s", Input("X"))
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 504456c4b069f81319893ae51f57503f5025761a..e9329a0e7e279e2bdd3c45986580c87aa5d0b1fe 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -24,8 +24,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                     const framework::VariableNameMap &outputs,
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto mem_var_name = Input("X");
     auto *mem_var = scope.FindVar(mem_var_name);
     PADDLE_ENFORCE(mem_var != nullptr,
@@ -76,8 +78,10 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                         const framework::VariableNameMap &outputs,
                         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto out_grad_var_name = Input(framework::GradVarName("Out"));
     auto *out_grad_var = scope.FindVar(out_grad_var_name);
 
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index c23de9073ef965b989e98936b2dd07fc6bce7fdc..e3953e4b08082c08e1bbf77a834d4a895b327f83 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -63,8 +63,10 @@ class SaveCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 483cdfa4c3b9e3b9abd3f32bc5e6e5e0b493bd23..85ba8e01182c2cd01aa599ddbce68b6b2d9aa5f4 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -62,8 +62,10 @@ class SaveOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index df50a324fde1637f1f9f64a0b0d4eff8ba3f26d2..7fe0526381d1fc18ad0552c321875af42df0f6dc 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -27,8 +27,9 @@ class ShrinkRNNMemoryOp : public ArrayOp {
                     const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x_var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
     auto &x_tensor = x_var->Get<framework::LoDTensor>();
@@ -108,8 +109,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
                         const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
     auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
     PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index f821dc54d7bbe697d3642e64dc1628ec7d966592..f9600d99a36f59feddfbb5295b8b21ca6d5034cd 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -33,8 +33,10 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto *out_true =
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index 06bcf82620bec57346c30b029d23ad8417252248..54420e1bf6ec982545715dc847b0b3e138cf2045 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -38,7 +38,7 @@ class SplitOpKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 385124305e2d9afd62313ca46178b4916cd6405d..4c7b90693a2f9ba62d9c30bb601ea4aaebeaf4b5 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -54,7 +54,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
                                      int64_t axis, T* dst,
                                      const framework::DDim& dst_stride_numel,
                                      const T* src,
-                                     const framework::DDim& src_stride_numel) {
+                                     const framework::DDim& src_stride_numel,
+                                     int64_t size) {
   int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
   int64_t src_after = src_stride_numel[axis];
   int64_t dst_after = dst_stride_numel[axis];
@@ -82,15 +83,14 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
     if (platform::is_cpu_place(place)) {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
-                   src + i * src_after, sizeof(T) * src_after);
+                   src + i * src_after, sizeof(T) * size);
     } else {
 #ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
       memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
-                   src + i * src_after, sizeof(T) * src_after,
-                   cuda_ctx.stream());
+                   src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
 #else
       PADDLE_THROW("Paddle is not compiled with GPU");
 #endif
diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
index 50811fb22491598849216f41a584ae0b68f8f306..704ee964c908c44d84316985429a6551b770e33f 100644
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -24,8 +24,9 @@ class WriteToArrayOp : public ArrayOp {
                  const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) return;
     auto &x_tensor = x->Get<framework::LoDTensor>();
@@ -122,8 +123,10 @@ class ReadFromArrayOp : public ArrayOp {
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x != nullptr, "X must be set");
     auto &x_array = x->Get<framework::LoDTensorArray>();
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index d254c572acff52d967e551c377b3b32b05c92973..a7a05cc5f79da6c1e6945a83f997e54041d2045d 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -39,8 +39,9 @@ class WhileOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
@@ -99,8 +100,9 @@ class WhileGradOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 1486d5ed2579a49a4722a8b0abdfdba6bf196615..442a7ea883052e73a5d50d5558f57732be93fb3a 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -204,6 +204,17 @@ function gen_capi_package() {
   fi
 }
 
+function gen_fluid_inference_lib() {
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]]; then
+        cat <<EOF
+    ========================================
+    Building fluid inference library ...
+    ========================================
+EOF
+        make inference_lib_dist
+    fi
+}
+
 set -xe
 
 cmake_gen ${PYTHON_ABI:-""}
@@ -212,6 +223,7 @@ run_test
 gen_docs
 gen_dockerfile
 gen_capi_package
+gen_fluid_inference_lib
 
 if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
   printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" 
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index e4675e24b178b2f1745c2b38270ac381ebfe6550..689920af0c4fb85d11c3492d83da2d22d9c4fa6e 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -121,6 +121,7 @@ def split_dense_variable(var_list,
                 block_size += dim1 - remains
         # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
+        print("###split var ", var.name, var.shape, block_size, split_count)
         for block_id in xrange(split_count):
             curr_block_size = min(block_size, var_numel - (
                 (block_id) * block_size))
@@ -191,7 +192,6 @@ class DistributeTranspiler:
         for b in param_blocks:
             varname, block_id, _ = b.split(":")
             send_outputs.append(param_var_mapping[varname][int(block_id)])
-
         # let send_op know which endpoint to send which var to, eplist has the same
         # order as send_inputs.
         eplist = split_method(send_inputs, pserver_endpoints)
@@ -230,21 +230,6 @@ class DistributeTranspiler:
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
-        self.lr_param_mapping = self._create_lr_param_mapping()
-
-    def _create_lr_param_mapping(self):
-        lr_mapping = dict()
-        for _, opt_op in enumerate(self.optimize_ops):
-            if not opt_op.inputs or not opt_op.inputs.has_key("LearningRate") \
-              or not opt_op.inputs.has_key("Param"):
-                continue
-            lr = opt_op.inputs["LearningRate"].name
-            param = opt_op.inputs["Param"].name
-            if not lr_mapping.has_key(lr):
-                lr_mapping.update({lr: list()})
-            lr_mapping[lr].append(param)
-        return lr_mapping
-
     def _create_vars_from_blocklist(self, program, block_list):
         # Create respective variables using the block_list
         block_map = dict()
@@ -271,6 +256,7 @@ class DistributeTranspiler:
                 splited_shape = [rows]
                 if len(orig_shape) >= 2:
                     splited_shape.extend(orig_shape[1:])
+                print("###splited: ", size, rows, splited_shape)
                 var = program.global_block().create_var(
                     name="%s.block%d" % (varname, i),
                     psersistable=False,
@@ -278,6 +264,7 @@ class DistributeTranspiler:
                     type=orig_var.type,
                     shape=splited_shape)  # flattend splited var
                 var_mapping[varname].append(var)
+                print("###created split var ", var)
         return var_mapping
 
     def _clone_var(self, block, var):
@@ -369,18 +356,9 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
-    def _fetch_var_names(self, param_dict):
-        res = []
-        if not param_dict:
-            return res
-        for _, values in param_dict.iteritems():
-            if not isinstance(values, list):
-                values = [values]
-            res += [v.name for v in values]
-        return res
-
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
         program = optimize_block.program
+        pserver_block = program.global_block()
         new_inputs = dict()
         # update param/grad shape first, then other inputs like
         # moment can use the updated shape
@@ -395,11 +373,11 @@ class DistributeTranspiler:
                     # do not append this op if current endpoint
                     # is not dealing with this grad block
                     return
-                merged_var = program.global_block().vars[grad_block.name]
+                merged_var = pserver_block.vars[grad_block.name]
                 # append merging ops if trainers > 1
                 if self.trainers > 1:
                     vars2merge = self._create_var_for_trainers(
-                        program.global_block(), grad_block, self.trainers)
+                        pserver_block, grad_block, self.trainers)
                     optimize_block.append_op(
                         type="sum",
                         inputs={"X": vars2merge},
@@ -419,29 +397,27 @@ class DistributeTranspiler:
                         break
                 if not param_block:
                     return
-                tmpvar = program.global_block().create_var(
+                tmpvar = pserver_block.create_var(
                     name=param_block.name,
                     persistable=True,
                     dtype=param_block.dtype,
                     shape=param_block.shape)
-
                 new_inputs[key] = tmpvar
             elif key == "LearningRate":
                 # leraning rate variable has already be created by non-optimize op,
                 # don't create it once again.
-                new_inputs[key] = program.global_block().vars[opt_op.input(key)[
-                    0]]
+                new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
 
         for key in opt_op.input_names:
             new_shape = None
             if key in ["Param", "Grad", "LearningRate"]:
                 continue
-            var = program.global_block().vars[opt_op.input(key)[0]]
+            var = self.program.global_block().vars[opt_op.input(key)[0]]
             # update accumulator variable shape
             param_shape = new_inputs["Param"].shape
             new_shape = self._get_optimizer_input_shape(opt_op.type, key,
                                                         var.shape, param_shape)
-            tmpvar = program.global_block().create_var(
+            tmpvar = pserver_block.create_var(
                 name=var.name,
                 persistable=var.persistable,
                 dtype=var.dtype,
@@ -449,11 +425,14 @@ class DistributeTranspiler:
             new_inputs[key] = tmpvar
 
         # change output's ParamOut variable
-        opt_op.outputs["ParamOut"] = new_inputs["Param"]
+        outputs = self._get_output_map_from_op(self.program.global_block().vars,
+                                               opt_op)
+        outputs["ParamOut"] = new_inputs["Param"]
+
         optimize_block.append_op(
             type=opt_op.type,
             inputs=new_inputs,
-            outputs=opt_op.outputs,
+            outputs=outputs,
             attrs=opt_op.attrs)
 
     def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
@@ -497,11 +476,12 @@ class DistributeTranspiler:
         # If one op's input is another op's output or
         # one op's output is another op's input, we say
         # the two operator is connected.
-        op1_input_names = self._fetch_var_names(op1.inputs)
-        op1_output_names = self._fetch_var_names(op1.outputs)
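+        # Read the variable names each op consumes and produces from its op desc.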
+        op1_input_names = op1.desc.input_arg_names()
+        op1_output_names = op1.desc.output_arg_names()
+
+        op2_input_names = op2.desc.input_arg_names()
+        op2_output_names = op2.desc.output_arg_names()
 
-        op2_input_names = self._fetch_var_names(op2.inputs)
-        op2_output_names = self._fetch_var_names(op2.outputs)
         if set(op1_output_names) & set(op2_input_names) or \
            set(op1_input_names) & set(op2_output_names):
             return True
@@ -521,8 +501,8 @@ class DistributeTranspiler:
     def _is_opt_op(self, op):
         # NOTE: It's a HACK implement.
         # optimize op: SGDOptimize, MomentumOptimizer, AdamOptimizer and etc... 
-        if op.inputs and op.inputs.has_key("Param") \
-          and op.inputs.has_key("LearningRate"):
+        if "Param" in op.input_names and \
+            "LearningRate" in op.input_names:
             return True
         return False
 
@@ -530,12 +510,12 @@ class DistributeTranspiler:
         param_names = [
             p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
         ]
-        if op.inputs["Param"].name in param_names:
+        if op.input("Param") in param_names:
             return True
         else:
             for n in param_names:
-                param = op.inputs["Param"].name
-                if same_or_split_var(n, param) and n != op.inputs["Param"].name:
+                param = op.input("Param")[0]
+                if same_or_split_var(n, param) and n != param:
                     return True
             return False
         return False
@@ -551,6 +531,8 @@ class DistributeTranspiler:
         """
         # step5
         pserver_program = Program()
+        print("param mapping on pserver: #### ",
+              self.param_grad_ep_mapping[endpoint]["params"])
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
             self._clone_var(pserver_program.global_block(), v)
         for v in self.param_grad_ep_mapping[endpoint]["grads"]:
@@ -564,7 +546,6 @@ class DistributeTranspiler:
                     persistable=True,
                     dtype=v.dtype,
                     shape=v.shape)
-
         # step6
         optimize_block = pserver_program.create_block(0)
         # step 6.1
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index a517db68c5886fbcbe19e6981aee5bf3971352e4..35d3df785ba4f74ce1681e471e7a83dfdaf71987 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -400,9 +400,6 @@ class Operator(object):
         """
         self.block = block
         self.desc = desc
-        # for clone a new operator
-        self.inputs = inputs
-        self.outputs = outputs
         self.attrs = attrs
         if len(self.desc.type()) != 0:
             return
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
index cfbbf710b6ac63b9a0fe7d51b0d1940532e948fc..f4fb2ca2798ab8ea8c7c634194d2e0c1371a2b93 100644
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -16,8 +16,6 @@ import ops
 from ops import *
 import nn
 from nn import *
-import detection
-from detection import *
 import io
 from io import *
 import tensor
@@ -33,7 +31,6 @@ from detection import *
 
 __all__ = []
 __all__ += math_op_patch.__all__
-__all__ += detection.__all__
 __all__ += nn.__all__
 __all__ += io.__all__
 __all__ += tensor.__all__
diff --git a/python/paddle/v2/fluid/layers/detection.py b/python/paddle/v2/fluid/layers/detection.py
index 6d0f12f47503c9dd4fed6e7eba5001555d3c84ce..6af5c8388b7f51563cf7208b89565c5aea2db71f 100644
--- a/python/paddle/v2/fluid/layers/detection.py
+++ b/python/paddle/v2/fluid/layers/detection.py
@@ -1,10 +1,10 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,20 +15,32 @@
 All layers just related to the detection neural network.
 """
 
+from layer_function_generator import generate_layer_fn
 from ..layer_helper import LayerHelper
-from ..param_attr import ParamAttr
-from ..framework import Variable
 import tensor
 import ops
 import nn
 import math
 
 __all__ = [
-    'detection_output',
     'prior_box',
     'multi_box_head',
+    'bipartite_match',
+    'target_assign',
+    'detection_output',
+    'ssd_loss',
+]
+
+__auto__ = [
+    'iou_similarity',
+    'box_coder',
 ]
 
+__all__ += __auto__
+
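+# For each op in __auto__, generate a thin Python wrapper layer directly from
+# the registered operator definition.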
+for _OP in set(__auto__):
+    globals()[_OP] = generate_layer_fn(_OP)
+
 
 def detection_output(scores,
                      loc,
@@ -98,18 +110,13 @@ def detection_output(scores,
     """
 
     helper = LayerHelper("detection_output", **locals())
-    decoded_box = helper.create_tmp_variable(dtype=loc.dtype)
-    helper.append_op(
-        type="box_coder",
-        inputs={
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': loc
-        },
-        outputs={'OutputBox': decoded_box},
-        attrs={'code_type': 'decode_center_size'})
-    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
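+    # Decode the predicted location offsets against the prior boxes before NMS.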
+    decoded_box = box_coder(
+        prior_box=prior_box,
+        prior_box_var=prior_box_var,
+        target_box=loc,
+        code_type='decode_center_size')
 
+    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -280,22 +287,22 @@ def prior_box(inputs,
     if aspect_ratios:
         _is_list_or_tuple_and_equal(
             aspect_ratios, num_layer,
-            'aspect_ratios should be list and the length of inputs '
+            'aspect_ratios should be list or tuple, and the length of inputs '
             'and aspect_ratios should be the same.')
     if step_h:
         _is_list_or_tuple_and_equal(
             step_h, num_layer,
-            'step_h should be list and the length of inputs and '
+            'step_h should be list or tuple, and the length of inputs and '
             'step_h should be the same.')
     if step_w:
         _is_list_or_tuple_and_equal(
             step_w, num_layer,
-            'step_w should be list and the length of inputs and '
+            'step_w should be list or tuple, and the length of inputs and '
             'step_w should be the same.')
     if steps:
         _is_list_or_tuple_and_equal(
             steps, num_layer,
-            'steps should be list and the length of inputs and '
+            'steps should be list or tuple, and the length of inputs and '
             'step_w should be the same.')
         step_w = steps
         step_h = steps
@@ -339,6 +346,331 @@ def prior_box(inputs,
     return box, var
 
 
+def bipartite_match(dist_matrix, name=None):
+    """
+    **Bipartite matching operator**
+
+    This operator implements a greedy bipartite matching algorithm, which is
+    used to obtain the matching with the maximum distance based on the input
+    distance matrix. For an input 2-D matrix, the bipartite matching algorithm
+    can find the matched column for each row and the matched row for each
+    column. This operator only calculates the matched indices from column to
+    row. For each instance, the number of matched indices is the number of
+    columns of the input distance matrix.
+
+    There are two outputs, which save the matched indices and distance.
+    In short, this algorithm matches the best (maximum distance) row entity
+    to each column entity, and the matched indices are not duplicated in each
+    row of ColToRowMatchIndices. If a column entity is not matched to any row
+    entity, -1 is set in ColToRowMatchIndices.
+
+    Please note that the input DistMat can be a LoDTensor (with LoD) or a
+    Tensor. If it is a LoDTensor with LoD, the height of ColToRowMatchIndices
+    is the batch size. If it is a Tensor, the height of ColToRowMatchIndices
+    is 1.
+
+    Args:
+        dist_matrix(Variable): This input is a 2-D LoDTensor with shape
+            [K, M]. It is the pair-wise distance matrix between the entities
+            represented by each row and each column. For example, assume one
+            entity is A with shape [K] and another entity is B with shape [M];
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
+            the distance is, the better the matching is. Please note that this
+            tensor can contain LoD information to represent a batch of inputs.
+            One instance of this batch can contain different numbers of
+            entities.
+    Returns:
+        match_indices(Variable): A 2-D Tensor with shape [N, M] of int32 type.
+            N is the batch size. If match_indices[i][j] is -1, it
+            means B[j] does not match any entity in the i-th instance.
+            Otherwise, it means B[j] is matched to row
+            match_indices[i][j] in the i-th instance. The row number of
+            the i-th instance is saved in match_indices[i][j].
+        match_distance(Variable): A 2-D Tensor with shape [N, M] of float type.
+            N is the batch size. If match_indices[i][j] is -1,
+            match_distance[i][j] is also -1.0. Otherwise, assuming the matched
+            row index is d and the row offsets of each instance are called LoD,
+            then match_distance[i][j] = dist_matrix[d + LoD[i]][j].
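+
+    Examples:
+        A minimal usage sketch, mirroring test_detection.py; the data names
+        and shapes below are illustrative only.
+
+        .. code-block:: python
+
+            x = layers.data(name='x', shape=[4], dtype='float32')
+            y = layers.data(name='y', shape=[4], dtype='float32')
+            iou = layers.iou_similarity(x=x, y=y)
+            matched_indices, matched_dist = layers.bipartite_match(iou)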
+    """
+    helper = LayerHelper('bipartite_match', **locals())
+    match_indices = helper.create_tmp_variable(dtype='int32')
+    match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype)
+    helper.append_op(
+        type='bipartite_match',
+        inputs={'DistMat': dist_matrix},
+        outputs={
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_distance
+        })
+    return match_indices, match_distance
+
+
+def target_assign(input,
+                  matched_indices,
+                  negative_indices=None,
+                  mismatch_value=None,
+                  name=None):
+    """
+    **Target assigner operator**
+
+    Given the target bounding boxes or labels, this operator assigns
+    classification and regression targets to each prediction, as well as
+    weights for the predictions. The weights are used to specify which
+    predictions do not contribute to the training loss.
+
+    For each instance, the outputs `out` and `out_weight` are assigned based on
+    `match_indices` and `negative_indices`.
+    Assuming that the row offset for each instance in `input` is called lod,
+    this operator assigns classification/regression targets by performing the
+    following steps:
+
+    1. Assigning all outputs based on `match_indices`:
+
+    If id = match_indices[i][j] > 0,
+
+        out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
+        out_weight[i][j] = 1.
+
+    Otherwise,
+
+        out[i][j][0 : K] = {mismatch_value, mismatch_value, ...}
+        out_weight[i][j] = 0.
+
+    2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
+
+    Assuming that the row offset for each instance in `neg_indices` is called
+    neg_lod, for the i-th instance and each `id` of neg_indices in this instance:
+
+        out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
+        out_weight[i][id] = 1.0
+
+    Args:
+       input (Variable): This input is a 3-D LoDTensor with shape [M, P, K].
+       matched_indices (Variable): The input matched indices is a 2-D
+           Tensor<int32> with shape [N, P]. If MatchIndices[i][j] is -1, the
+           j-th entity of column is not matched to any entity of row in the
+           i-th instance.
+       negative_indices (Variable): The input negative example indices are
+           an optional input with shape [Neg, 1] and int32 type, where Neg is
+           the total number of negative example indices.
+       mismatch_value (float32): Fill the mismatched locations with this value.
+
+    Returns:
+       out (Variable): The output is a 3-D Tensor with shape [N, P, K].
+           N and P are the same as they are in `matched_indices`, and K is
+           the same as it is in the input X.
+       out_weight (Variable): The weight for output with the shape of [N, P, 1].
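+
+    Examples:
+        A minimal usage sketch, mirroring test_detection.py; the data names,
+        shapes and mismatch_value below are illustrative only.
+
+        .. code-block:: python
+
+            x = layers.data(name='x', shape=[4], dtype='float32')
+            y = layers.data(name='y', shape=[4], dtype='float32')
+            iou = layers.iou_similarity(x=x, y=y)
+            matched_indices, matched_dist = layers.bipartite_match(iou)
+            gt = layers.data(
+                name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                gt, matched_indices, mismatch_value=0)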
+    """
+    helper = LayerHelper('target_assign', **locals())
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    out_weight = helper.create_tmp_variable(dtype='float32')
+    helper.append_op(
+        type='target_assign',
+        inputs={
+            'X': input,
+            'MatchIndices': matched_indices,
+            'NegIndices': negative_indices
+        },
+        outputs={'Out': out,
+                 'OutWeight': out_weight},
+        attrs={'mismatch_value': mismatch_value})
+    return out, out_weight
+
+
+def ssd_loss(location,
+             confidence,
+             gt_box,
+             gt_label,
+             prior_box,
+             prior_box_var=None,
+             background_label=0,
+             overlap_threshold=0.5,
+             neg_pos_ratio=3.0,
+             neg_overlap=0.5,
+             loc_loss_weight=1.0,
+             conf_loss_weight=1.0,
+             match_type='per_prediction',
+             mining_type='max_negative',
+             sample_size=None):
+    """
+    **Multi-box loss layer for the object detection algorithm of SSD**
+
+    This layer computes the detection loss for SSD, given the location offset
+    predictions, confidence predictions, prior boxes, ground-truth bounding
+    boxes and labels, and the type of hard example mining. The returned loss
+    is a weighted sum of the localization loss (or regression loss) and the
+    confidence loss (or classification loss), obtained by performing the
+    following steps:
+
+    1. Find the matched bounding box by the bipartite matching algorithm.
+      1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+      1.2 Compute the matched bounding box by the bipartite matching algorithm.
+    2. Compute confidence for mining hard examples.
+      2.1. Get the target label based on the matched indices.
+      2.2. Compute the confidence loss.
+    3. Apply hard example mining to get the negative example indices and update
+       the matched indices.
+    4. Assign classification and regression targets.
+      4.1. Encode the bboxes according to the prior boxes.
+      4.2. Assign regression targets.
+      4.3. Assign classification targets.
+    5. Compute the overall objective loss.
+      5.1 Compute the confidence loss.
+      5.2 Compute the localization loss.
+      5.3 Compute the overall weighted loss.
+
+    Args:
+        location (Variable): The location predictions are a 3D Tensor with
+            shape [N, Np, 4], N is the batch size, Np is total number of
+            predictions for each instance. 4 is the number of coordinate values,
+            the layout is [xmin, ymin, xmax, ymax].
+        confidence (Variable): The confidence predictions are a 3D Tensor
+            with shape [N, Np, C], N and Np are the same as they are in
+            `location`, C is the class number.
+        gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        gt_label (Variable): The ground-truth labels are a 2D LoDTensor
+            with shape [Ng, 1].
+        prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4].
+        prior_box_var (Variable): The variance of prior boxes are a 2D Tensor
+            with shape [Np, 4].
+        background_label (int): The index of background label, 0 by default.
+        overlap_threshold (float): If match_type is 'per_prediction', use
+            `overlap_threshold` to determine the extra matching bboxes when
+            finding matched boxes. 0.5 by default.
+        neg_pos_ratio (float): The ratio of the negative boxes to the positive
+            boxes, used only when mining_type is max_negative, 3.0 by default.
+        neg_overlap (float): The negative overlap upper bound for the unmatched
+            predictions. Used only when mining_type is max_negative,
+            0.5 by default.
+        sample_size (int): The max sample size of negative boxes, used only
+            when mining_type is hard_example.
+        loc_loss_weight (float): Weight for localization loss, 1.0 by default.
+        conf_loss_weight (float): Weight for confidence loss, 1.0 by default.
+        match_type (str): The type of matching method during training, should
+            be 'bipartite' or 'per_prediction'.
+        mining_type (str): The hard example mining type, should be 'hard_example'
+            or 'max_negative'; currently only 'max_negative' is supported.
+
+    Returns:
+        Variable: The weighted sum of the localization loss and confidence loss,
+            with shape [N * Np, 1], N and Np are the same as they are
+            in `location`.
+
+    Raises:
+        ValueError: If mining_type is 'hard_example'; currently only the
+            mining type of 'max_negative' is supported.
+
+    Examples:
+        .. code-block:: python
+
+            pb = layers.data(
+                name='prior_box',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            pbv = layers.data(
+                name='prior_box_var',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+            gt_box = layers.data(
+                name='gt_box', shape=[4], lod_level=1, dtype='float32')
+            gt_label = layers.data(
+                name='gt_label', shape=[1], lod_level=1, dtype='float32')
+            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+    """
+
+    helper = LayerHelper('ssd_loss', **locals())
+    if mining_type != 'max_negative':
+        raise ValueError("Only mining_type == 'max_negative' is supported now.")
+
+    num, num_prior, num_class = confidence.shape
+
+    def __reshape_to_2d(var):
+        return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+
+    # 1. Find matched bounding box by prior box.
+    #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+    iou = iou_similarity(x=gt_box, y=prior_box)
+    #   1.2 Compute matched bounding box by bipartite matching algorithm.
+    matched_indices, matched_dist = bipartite_match(iou)
+
+    # 2. Compute confidence for mining hard examples
+    # 2.1. Get the target label based on matched indices
+    gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    target_label, _ = target_assign(
+        gt_label, matched_indices, mismatch_value=background_label)
+    # 2.2. Compute confidence loss.
+    # Reshape confidence to 2D tensor.
+    confidence = __reshape_to_2d(confidence)
+    target_label = tensor.cast(x=target_label, dtype='int64')
+    target_label = __reshape_to_2d(target_label)
+    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
+
+    # 3. Mining hard examples
+    conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    neg_indices = helper.create_tmp_variable(dtype='int32')
+    dtype = matched_indices.dtype
+    updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='mine_hard_examples',
+        inputs={
+            'ClsLoss': conf_loss,
+            'LocLoss': None,
+            'MatchIndices': matched_indices,
+            'MatchDist': matched_dist,
+        },
+        outputs={
+            'NegIndices': neg_indices,
+            'UpdatedMatchIndices': updated_matched_indices
+        },
+        attrs={
+            'neg_pos_ratio': neg_pos_ratio,
+            'neg_dist_threshold': neg_overlap,
+            'mining_type': mining_type,
+            'sample_size': sample_size,
+        })
+
+    # 4. Assign classification and regression targets
+    # 4.1. Encoded bbox according to the prior boxes.
+    encoded_bbox = box_coder(
+        prior_box=prior_box,
+        prior_box_var=prior_box_var,
+        target_box=gt_box,
+        code_type='encode_center_size')
+    # 4.2. Assign regression targets
+    target_bbox, target_loc_weight = target_assign(
+        encoded_bbox, updated_matched_indices, mismatch_value=background_label)
+    # 4.3. Assign classification targets
+    target_label, target_conf_weight = target_assign(
+        gt_label,
+        updated_matched_indices,
+        negative_indices=neg_indices,
+        mismatch_value=background_label)
+
+    # 5. Compute loss.
+    # 5.1 Compute confidence loss.
+    target_label = __reshape_to_2d(target_label)
+    target_label = tensor.cast(x=target_label, dtype='int64')
+    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
+    target_conf_weight = __reshape_to_2d(target_conf_weight)
+    conf_loss = conf_loss * target_conf_weight
+
+    # 5.2 Compute regression loss.
+    location = __reshape_to_2d(location)
+    target_bbox = __reshape_to_2d(target_bbox)
+
+    loc_loss = nn.smooth_l1(location, target_bbox)
+    target_loc_weight = __reshape_to_2d(target_loc_weight)
+    loc_loss = loc_loss * target_loc_weight
+
+    # 5.3 Compute overall weighted loss.
+    loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
+    return loss
+
+
 def multi_box_head(inputs,
                    num_classes,
                    min_sizes=None,
diff --git a/python/paddle/v2/fluid/tests/test_detection.py b/python/paddle/v2/fluid/tests/test_detection.py
index 2f1ecd66775cda506b880a85ebbe7c29e0c0857a..dd28a05313cd38665a6390552fc3247b84949f04 100644
--- a/python/paddle/v2/fluid/tests/test_detection.py
+++ b/python/paddle/v2/fluid/tests/test_detection.py
@@ -15,12 +15,11 @@
 from __future__ import print_function
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.layers.detection as detection
 from paddle.v2.fluid.framework import Program, program_guard
 import unittest
 
 
-class TestBook(unittest.TestCase):
+class TestDetection(unittest.TestCase):
     def test_detection_output(self):
         program = Program()
         with program_guard(program):
@@ -47,7 +46,67 @@ class TestBook(unittest.TestCase):
             out = layers.detection_output(
                 scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv)
             self.assertIsNotNone(out)
-        # print(str(program))
+            self.assertEqual(out.shape[-1], 6)
+        print(str(program))
+
+    def test_detection_api(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[4], dtype='float32')
+            y = layers.data(name='y', shape=[4], dtype='float32')
+            z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
+            iou = layers.iou_similarity(x=x, y=y)
+            bcoder = layers.box_coder(
+                prior_box=x,
+                prior_box_var=y,
+                target_box=z,
+                code_type='encode_center_size')
+            self.assertIsNotNone(iou)
+            self.assertIsNotNone(bcoder)
+
+            matched_indices, matched_dist = layers.bipartite_match(iou)
+            self.assertIsNotNone(matched_indices)
+            self.assertIsNotNone(matched_dist)
+
+            gt = layers.data(
+                name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                gt, matched_indices, mismatch_value=0)
+            self.assertIsNotNone(trg)
+            self.assertIsNotNone(trg_weight)
+
+            gt2 = layers.data(
+                name='gt2', shape=[10, 4], dtype='float32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                gt2, matched_indices, mismatch_value=0)
+            self.assertIsNotNone(trg)
+            self.assertIsNotNone(trg_weight)
+
+        print(str(program))
+
+    def test_ssd_loss(self):
+        program = Program()
+        with program_guard(program):
+            pb = layers.data(
+                name='prior_box',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            pbv = layers.data(
+                name='prior_box_var',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+            gt_box = layers.data(
+                name='gt_box', shape=[4], lod_level=1, dtype='float32')
+            gt_label = layers.data(
+                name='gt_label', shape=[1], lod_level=1, dtype='int32')
+            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+            self.assertIsNotNone(loss)
+            self.assertEqual(loss.shape[-1], 1)
+        print(str(program))
 
 
 class TestPriorBox(unittest.TestCase):
@@ -68,7 +127,7 @@ class TestPriorBox(unittest.TestCase):
         conv4 = fluid.layers.conv2d(conv3, 3, 3, 2)
         conv5 = fluid.layers.conv2d(conv4, 3, 3, 2)
 
-        box, var = detection.prior_box(
+        box, var = layers.prior_box(
             inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
             image=images,
             min_ratio=20,