Commit 8ea2288e authored by chengduoZH

remove conflict

@@ -16,12 +16,10 @@ function(copy TARGET)
   foreach(index RANGE ${len})
     list(GET copy_lib_SRCS ${index} src)
     list(GET copy_lib_DSTS ${index} dst)
-    add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
-    if(IS_DIRECTORY ${src})
-      add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
-    else()
-      add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
-    endif()
+    add_custom_command(TARGET ${TARGET} PRE_BUILD
+      COMMAND mkdir -p "${dst}"
+      COMMAND cp -r "${src}" "${dst}"
+      COMMENT "copying ${src} -> ${dst}")
   endforeach()
 endfunction()
@@ -53,11 +51,11 @@ IF(NOT PROTOBUF_FOUND)
 ENDIF(NOT PROTOBUF_FOUND)
 
 # paddle fluid module
-set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
@@ -69,7 +67,7 @@ copy(memory_lib
 set(module "inference")
 copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
......
@@ -25,7 +25,10 @@ namespace framework {
 class CosineOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -44,7 +47,10 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
......
@@ -64,6 +64,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  if (platform::is_gpu_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("Cannot run operator on place %s", place);
+#else
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    platform::SetDeviceId(dev_id);
+#endif
+  }
+  RunImpl(scope, place);
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -479,8 +491,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
-void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::Place& place) const {
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
......
@@ -89,8 +89,9 @@ class OperatorBase {
   std::string DebugString() const { return DebugStringEx(nullptr); }
 
-  /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
+  /// Net will call this interface function to Run an op.
+  //  The implementation should be written at RunImpl
+  void Run(const Scope& scope, const platform::Place& place);
 
   // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
   virtual void Stop() {}
@@ -144,6 +145,8 @@ class OperatorBase {
  private:
   void GenerateTemporaryNames();
   void CheckAllInputOutputSet() const;
+  virtual void RunImpl(const Scope& scope,
+                       const platform::Place& place) const = 0;
 };
 
 // Macro for define a clone method.
@@ -168,10 +171,13 @@ class OperatorBase {
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
   std::unique_ptr<OperatorBase> Clone() const override {
     return std::unique_ptr<OperatorBase>(new NOP(*this));
   }
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class ExecutionContext {
@@ -363,8 +369,6 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const Scope& scope, const platform::Place& place) const final;
-
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
   AllOpKernels() {
     static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
@@ -393,6 +397,7 @@ class OperatorWithKernel : public OperatorBase {
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
......
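The operator.h/operator.cc hunks above replace the public virtual Run with a non-virtual Run that performs the shared device setup once and then dispatches to a private virtual RunImpl (the non-virtual-interface idiom). A minimal, self-contained sketch of that idiom follows; Scope, Place, and MyOp are simplified placeholders, not the real Paddle types or signatures.

#include <iostream>
#include <string>

// Placeholder stand-ins for framework::Scope and platform::Place.
struct Scope {};
using Place = std::string;

class OperatorBase {
 public:
  virtual ~OperatorBase() = default;

  // Public, non-virtual entry point: common work happens here once,
  // then control is handed to the private hook.
  void Run(const Scope& scope, const Place& place) {
    if (place == "GPU") {
      // In the real framework this is where the CUDA device id is set,
      // or an error is thrown when CUDA is not compiled in.
      std::cout << "setting device for " << place << "\n";
    }
    RunImpl(scope, place);
  }

 private:
  // Derived operators override only this hook.
  virtual void RunImpl(const Scope& scope, const Place& place) const = 0;
};

class MyOp : public OperatorBase {
 private:
  void RunImpl(const Scope& scope, const Place& place) const override {
    (void)scope;
    std::cout << "MyOp::RunImpl on " << place << "\n";
  }
};

int main() {
  Scope scope;
  MyOp op;
  op.Run(scope, "GPU");  // always goes through OperatorBase::Run
}

Because derived classes can only override RunImpl, every operator invocation is forced through the single Run entry point, which is what lets the commit centralize the GPU device-id handling.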
@@ -28,7 +28,10 @@ class OpWithoutKernelTest : public OperatorBase {
   OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                       const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++op_run_num;
     ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
     ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -259,8 +262,10 @@ class OperatorClone : public paddle::framework::OperatorBase {
                 const paddle::framework::VariableNameMap& outputs,
                 const paddle::framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const paddle::framework::Scope& scope,
+               const paddle::platform::Place& place) const override {}
 };
 
 TEST(Operator, Clone) {
......
@@ -31,8 +31,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
......
@@ -71,8 +71,10 @@ class AssignOp : public framework::OperatorBase {
            const framework::VariableNameMap &outputs,
            const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) {
       return;
......
@@ -55,8 +55,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                     const framework::VariableNameMap& outputs,
                     const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
......
@@ -204,8 +204,9 @@ class BeamSearchOp : public framework::OperatorBase {
     PADDLE_THROW("Not Implemented");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
     auto pre_ids_var = scope.FindVar(Input("pre_ids"));
......
@@ -38,7 +38,7 @@ class ConcatKernel : public framework::OpKernel<T> {
       auto in_stride = framework::stride_numel(in->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                   out->data<T>() + output_offset, out_stride,
-                                  in->data<T>(), in_stride);
+                                  in->data<T>(), in_stride, in_stride[axis]);
       output_offset += in_stride[axis];
     }
   }
@@ -59,7 +59,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
......
@@ -193,7 +193,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
   }
 }
 
-void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
   // get device context from pool
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(place);
......
@@ -77,8 +77,9 @@ class CondOp : public framework::OperatorBase {
     sub_net_op_[FALSE_BRANCH] = std::move(net);
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override;
 
  private:
   const int TRUE_BRANCH = 0;
......
@@ -65,8 +65,10 @@ class ConditionalBlockOp : public ConditionalOp {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
 
     bool need_run;
@@ -128,8 +130,10 @@ class ConditionalBlockGradOp : public ConditionalOp {
                          const framework::VariableNameMap &outputs,
                          const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
 
     bool need_run;
......
@@ -106,8 +106,10 @@ template <typename T>
 class CreateRandomDataGeneratorOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
@@ -155,8 +157,10 @@ class CreateRandomDataGeneratorOpMaker
 class CreateShuffleReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
@@ -187,8 +191,10 @@ class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 class CreateBatchReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
......
@@ -24,8 +24,10 @@ class FeedOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
......
@@ -26,8 +26,9 @@ class FetchOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
......
@@ -33,8 +33,10 @@ class FillConstantInferShape : public framework::InferShapeBase {
 class FillConstantOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto data_type =
         static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
......
@@ -42,8 +42,10 @@ class FillOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &out =
         detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                 "Cannot find variable %s", Output("Out"))
......
@@ -37,8 +37,10 @@ class GetPlacesOp : public framework::OperatorBase {
               const framework::VariableNameMap &outputs,
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     bool is_gpu;
     if (Attr<std::string>("device_type") == "AUTO") {
       is_gpu = platform::is_gpu_place(place);
......
@@ -51,8 +51,9 @@ class IncrementOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
......
@@ -28,8 +28,9 @@ class IsEmptyOp : public framework::OperatorBase {
             const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get input
     auto *var = scope.FindVar(Input(kInput));
     PADDLE_ENFORCE_NOT_NULL(var);
......
@@ -26,8 +26,10 @@ class LoadCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
......
@@ -25,8 +25,10 @@ class LoadOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
......
@@ -25,8 +25,10 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
......
@@ -23,8 +23,10 @@ class LoDRankTableOp : public framework::OperatorBase {
                  const framework::VariableNameMap &outputs,
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
......
@@ -32,8 +32,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                           Input("X"))
                   .Get<framework::LoDTensor>();
......
@@ -27,8 +27,9 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
     auto *out =
......
@@ -27,8 +27,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
......
@@ -237,6 +237,8 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
     }
 
     ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+    // The first dimension of NegIndices will be set correcttly in Compute.
+    ctx->SetOutputDim("NegIndices", {-1, 1});
   }
 
  protected:
......
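The MineHardExamplesOp hunk above registers a NegIndices output whose first dimension is not known at shape-inference time, so InferShape records a -1 placeholder and the kernel fills in the real extent once the data has been mined. A small sketch of that placeholder-then-resize pattern, with made-up helper names (InferNegIndicesDim and ResizeAtCompute are illustrative only, not Paddle APIs):

#include <cstdio>
#include <vector>

// Toy stand-in: a "tensor shape" is just a vector of extents here.
struct Dims { std::vector<long> d; };

// Shape inference runs before any data is available, so the number of
// mined negatives is unknown; a -1 placeholder is recorded for now.
Dims InferNegIndicesDim() { return Dims{{-1, 1}}; }

// At compute time the real count is known and the placeholder is replaced.
Dims ResizeAtCompute(long neg_count) { return Dims{{neg_count, 1}}; }

int main() {
  Dims inferred = InferNegIndicesDim();
  Dims actual = ResizeAtCompute(/*neg_count=*/37);
  std::printf("inferred: {%ld, %ld}, actual: {%ld, %ld}\n",
              inferred.d[0], inferred.d[1], actual.d[0], actual.d[1]);
}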
@@ -26,8 +26,9 @@ class NCCLInitOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
......
@@ -57,20 +57,6 @@ class NetOp : public framework::OperatorBase {
     this->CompleteAddOp();
   }
 
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
   bool SupportGPU() const override {
     for (auto& op : ops_) {
       if (!op->SupportGPU()) {
@@ -117,6 +103,20 @@ class NetOp : public framework::OperatorBase {
   std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
 
  private:
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`, if no scope is provided, default
+   * scope will be used instead. If no OpContext is provicded, default context
+   * will be used.
+   */
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, place);
+    }
+  }
+
   bool add_op_done_{false};
   std::set<std::string> intermediate_outputs_;
......
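In the NetOp hunk the loop over sub-operators moves into the private RunImpl, but each child is still started through its public Run, so nested operators also pass through the common pre-run path. A self-contained sketch of that composite arrangement, again with placeholder types (NetLikeOp and LeafOp are illustrative, not the real Paddle classes):

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Scope {};
using Place = std::string;

class OperatorBase {
 public:
  virtual ~OperatorBase() = default;
  void Run(const Scope& scope, const Place& place) {
    std::cout << "common pre-run work (device setup) for " << place << "\n";
    RunImpl(scope, place);
  }

 private:
  virtual void RunImpl(const Scope& scope, const Place& place) const = 0;
};

// A composite operator: its own RunImpl just runs every child through the
// child's public Run, so the children also get the pre-run work.
class NetLikeOp : public OperatorBase {
 public:
  void Add(std::unique_ptr<OperatorBase> op) { ops_.push_back(std::move(op)); }

 private:
  void RunImpl(const Scope& scope, const Place& place) const override {
    for (auto& op : ops_) {
      op->Run(scope, place);
    }
  }
  std::vector<std::unique_ptr<OperatorBase>> ops_;
};

class LeafOp : public OperatorBase {
 private:
  void RunImpl(const Scope&, const Place&) const override {
    std::cout << "leaf op ran\n";
  }
};

int main() {
  NetLikeOp net;
  net.Add(std::make_unique<LeafOp>());
  net.Add(std::make_unique<LeafOp>());
  Scope scope;
  net.Run(scope, "CPU");
}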
@@ -26,7 +26,10 @@ class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++run_cnt;
   }
 };
......
@@ -118,8 +118,9 @@ class ParallelDoOp : public framework::OperatorBase {
                const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
@@ -207,8 +208,9 @@ class ParallelDoGradOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
     auto *program = block->Program();
......
@@ -130,8 +130,9 @@ class TensorPrintOp : public framework::OperatorBase {
     PADDLE_THROW("Not implemented.");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
     const framework::Variable* in_var_ptr = nullptr;
     std::string phase = kForward;
     std::string printed_var_name = "";
......
@@ -54,8 +54,10 @@ class ReadInferVarType : public framework::VarTypeInference {
 class ReadOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     framework::ReaderHolder* reader =
         scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
     if (!reader->HasNext()) {
......
@@ -226,8 +226,9 @@ class RecurrentOp : public RecurrentBase {
               const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
@@ -315,8 +316,9 @@ class RecurrentGradOp : public RecurrentBase {
                   const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
......
@@ -75,8 +75,10 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
                                   const framework::VariableNameMap &outputs,
                                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x =
         detail::Ref(scope.FindVar(Input("X")),
                     "Cannot find input lod tensor variable %s", Input("X"))
......
@@ -24,8 +24,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                     const framework::VariableNameMap &outputs,
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto mem_var_name = Input("X");
     auto *mem_var = scope.FindVar(mem_var_name);
     PADDLE_ENFORCE(mem_var != nullptr,
@@ -76,8 +78,10 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                         const framework::VariableNameMap &outputs,
                         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto out_grad_var_name = Input(framework::GradVarName("Out"));
     auto *out_grad_var = scope.FindVar(out_grad_var_name);
......
@@ -63,8 +63,10 @@ class SaveCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
......
@@ -62,8 +62,10 @@ class SaveOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
......
@@ -27,8 +27,9 @@ class ShrinkRNNMemoryOp : public ArrayOp {
                     const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x_var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
     auto &x_tensor = x_var->Get<framework::LoDTensor>();
@@ -108,8 +109,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
                         const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
     auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
     PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
......
@@ -33,8 +33,10 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto *out_true =
......
@@ -38,7 +38,7 @@ class SplitOpKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
......
@@ -54,7 +54,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
                                      int64_t axis, T* dst,
                                      const framework::DDim& dst_stride_numel,
                                      const T* src,
-                                     const framework::DDim& src_stride_numel) {
+                                     const framework::DDim& src_stride_numel,
+                                     int64_t size) {
   int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
   int64_t src_after = src_stride_numel[axis];
   int64_t dst_after = dst_stride_numel[axis];
@@ -82,15 +83,14 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
     if (platform::is_cpu_place(place)) {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
-                   src + i * src_after, sizeof(T) * src_after);
+                   src + i * src_after, sizeof(T) * size);
     } else {
 #ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
       memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
-                   src + i * src_after, sizeof(T) * src_after,
-                   cuda_ctx.stream());
+                   src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
 #else
       PADDLE_THROW("Paddle is not compiled with GPU");
 #endif
......
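The StridedNumelCopyWithAxis change above makes the number of elements copied per row an explicit argument instead of deriving it from the source stride, which is what the concat and split call sites now pass as in_stride[axis] and out_stride[axis]. A hedged, self-contained sketch of the underlying row-wise copy idea, with made-up names (StridedRowCopy is not the Paddle helper):

#include <cstring>
#include <iostream>
#include <vector>

// Copy `rows` rows of `size` elements each from src to dst, where the two
// buffers may have different row widths (strides). Passing the width
// explicitly lets the same routine serve both the narrow and the wide side.
template <typename T>
void StridedRowCopy(T* dst, long dst_stride, const T* src, long src_stride,
                    long rows, long size) {
  for (long i = 0; i < rows; ++i) {
    std::memcpy(dst + i * dst_stride, src + i * src_stride, sizeof(T) * size);
  }
}

int main() {
  // Concat-like use: two 2x2 inputs packed into a 2x4 output along axis 1.
  std::vector<float> a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, out(8, 0);
  StridedRowCopy(out.data() + 0, /*dst_stride=*/4, a.data(), /*src_stride=*/2,
                 /*rows=*/2, /*size=*/2);
  StridedRowCopy(out.data() + 2, /*dst_stride=*/4, b.data(), /*src_stride=*/2,
                 /*rows=*/2, /*size=*/2);
  for (float v : out) std::cout << v << " ";  // prints 1 2 5 6 3 4 7 8
  std::cout << "\n";
}

With the width explicit, the concat kernel can copy a narrow input row into a wider destination row, and the split kernel can copy a narrow slice out of a wider source row, without the helper guessing the width from either stride.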
@@ -24,8 +24,9 @@ class WriteToArrayOp : public ArrayOp {
                  const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) return;
     auto &x_tensor = x->Get<framework::LoDTensor>();
@@ -122,8 +123,10 @@ class ReadFromArrayOp : public ArrayOp {
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x != nullptr, "X must be set");
     auto &x_array = x->Get<framework::LoDTensorArray>();
......
@@ -39,8 +39,9 @@ class WhileOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
@@ -99,8 +100,9 @@ class WhileGradOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
......
@@ -204,6 +204,17 @@ function gen_capi_package() {
   fi
 }
 
+function gen_fluid_inference_lib() {
+  if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+    cat <<EOF
+    ========================================
+    Building fluid inference library ...
+    ========================================
+EOF
+    make inference_lib_dist
+  fi
+}
+
 set -xe
 
 cmake_gen ${PYTHON_ABI:-""}
@@ -212,6 +223,7 @@ run_test
 gen_docs
 gen_dockerfile
 gen_capi_package
+gen_fluid_inference_lib
 
 if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
   printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
......
@@ -121,6 +121,7 @@ def split_dense_variable(var_list,
                 block_size += dim1 - remains
         # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
+        print("###split var ", var.name, var.shape, block_size, split_count)
         for block_id in xrange(split_count):
             curr_block_size = min(block_size, var_numel - (
                 (block_id) * block_size))
@@ -191,7 +192,6 @@ class DistributeTranspiler:
         for b in param_blocks:
             varname, block_id, _ = b.split(":")
             send_outputs.append(param_var_mapping[varname][int(block_id)])
-
         # let send_op know which endpoint to send which var to, eplist has the same
         # order as send_inputs.
         eplist = split_method(send_inputs, pserver_endpoints)
@@ -230,21 +230,6 @@ class DistributeTranspiler:
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
-        self.lr_param_mapping = self._create_lr_param_mapping()
-
-    def _create_lr_param_mapping(self):
-        lr_mapping = dict()
-        for _, opt_op in enumerate(self.optimize_ops):
-            if not opt_op.inputs or not opt_op.inputs.has_key("LearningRate") \
-                    or not opt_op.inputs.has_key("Param"):
-                continue
-            lr = opt_op.inputs["LearningRate"].name
-            param = opt_op.inputs["Param"].name
-            if not lr_mapping.has_key(lr):
-                lr_mapping.update({lr: list()})
-            lr_mapping[lr].append(param)
-        return lr_mapping
-
     def _create_vars_from_blocklist(self, program, block_list):
         # Create respective variables using the block_list
         block_map = dict()
@@ -271,6 +256,7 @@ class DistributeTranspiler:
                 splited_shape = [rows]
                 if len(orig_shape) >= 2:
                     splited_shape.extend(orig_shape[1:])
+                print("###splited: ", size, rows, splited_shape)
                 var = program.global_block().create_var(
                     name="%s.block%d" % (varname, i),
                     psersistable=False,
@@ -278,6 +264,7 @@ class DistributeTranspiler:
                     type=orig_var.type,
                     shape=splited_shape)  # flattend splited var
                 var_mapping[varname].append(var)
+                print("###created split var ", var)
         return var_mapping
 
     def _clone_var(self, block, var):
@@ -369,18 +356,9 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
-    def _fetch_var_names(self, param_dict):
-        res = []
-        if not param_dict:
-            return res
-        for _, values in param_dict.iteritems():
-            if not isinstance(values, list):
-                values = [values]
-            res += [v.name for v in values]
-        return res
-
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
         program = optimize_block.program
+        pserver_block = program.global_block()
         new_inputs = dict()
         # update param/grad shape first, then other inputs like
         # moment can use the updated shape
@@ -395,11 +373,11 @@ class DistributeTranspiler:
                 # do not append this op if current endpoint
                 # is not dealing with this grad block
                 return
-            merged_var = program.global_block().vars[grad_block.name]
+            merged_var = pserver_block.vars[grad_block.name]
             # append merging ops if trainers > 1
             if self.trainers > 1:
                 vars2merge = self._create_var_for_trainers(
-                    program.global_block(), grad_block, self.trainers)
+                    pserver_block, grad_block, self.trainers)
                 optimize_block.append_op(
                     type="sum",
                     inputs={"X": vars2merge},
...@@ -419,29 +397,27 @@ class DistributeTranspiler: ...@@ -419,29 +397,27 @@ class DistributeTranspiler:
break break
if not param_block: if not param_block:
return return
tmpvar = program.global_block().create_var( tmpvar = pserver_block.create_var(
name=param_block.name, name=param_block.name,
persistable=True, persistable=True,
dtype=param_block.dtype, dtype=param_block.dtype,
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate": elif key == "LearningRate":
# learning rate variable has already been created by a non-optimize op, # learning rate variable has already been created by a non-optimize op,
# don't create it again. # don't create it again.
new_inputs[key] = program.global_block().vars[opt_op.input(key)[ new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
0]]
for key in opt_op.input_names: for key in opt_op.input_names:
new_shape = None new_shape = None
if key in ["Param", "Grad", "LearningRate"]: if key in ["Param", "Grad", "LearningRate"]:
continue continue
var = program.global_block().vars[opt_op.input(key)[0]] var = self.program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape # update accumulator variable shape
param_shape = new_inputs["Param"].shape param_shape = new_inputs["Param"].shape
new_shape = self._get_optimizer_input_shape(opt_op.type, key, new_shape = self._get_optimizer_input_shape(opt_op.type, key,
var.shape, param_shape) var.shape, param_shape)
tmpvar = program.global_block().create_var( tmpvar = pserver_block.create_var(
name=var.name, name=var.name,
persistable=var.persistable, persistable=var.persistable,
dtype=var.dtype, dtype=var.dtype,
...@@ -449,11 +425,14 @@ class DistributeTranspiler: ...@@ -449,11 +425,14 @@ class DistributeTranspiler:
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
# change output's ParamOut variable # change output's ParamOut variable
opt_op.outputs["ParamOut"] = new_inputs["Param"] outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op)
outputs["ParamOut"] = new_inputs["Param"]
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=opt_op.outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.attrs)
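To make the shape handling above concrete (illustrative values only, assuming the usual case where an optimizer accumulator such as Moment mirrors the parameter shape):

# If this pserver owns a 5-row block of a 10x5 parameter, the accumulator
# input is re-created with the block's shape rather than the full shape.
param_shape = (5, 5)          # shape of the split "Param" block on this pserver
full_moment_shape = (10, 5)   # accumulator shape in the trainer-side program
new_shape = param_shape       # expected result of _get_optimizer_input_shape for "Moment"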
def _append_pserver_non_opt_ops(self, optimize_block, opt_op): def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
...@@ -497,11 +476,12 @@ class DistributeTranspiler: ...@@ -497,11 +476,12 @@ class DistributeTranspiler:
# If one op's input is another op's output or # If one op's input is another op's output or
# one op's output is another op's input, we say # one op's output is another op's input, we say
# the two operators are connected. # the two operators are connected.
op1_input_names = self._fetch_var_names(op1.inputs) op1_input_names = op1.desc.input_arg_names()
op1_output_names = self._fetch_var_names(op1.outputs) op1_output_names = op1.desc.output_arg_names()
op2_input_names = op2.desc.input_arg_names()
op2_output_names = op2.desc.output_arg_names()
op2_input_names = self._fetch_var_names(op2.inputs)
op2_output_names = self._fetch_var_names(op2.outputs)
if set(op1_output_names) & set(op2_input_names) or \ if set(op1_output_names) & set(op2_input_names) or \
set(op1_input_names) & set(op2_output_names): set(op1_input_names) & set(op2_output_names):
return True return True
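A standalone sketch of the connectivity test above, with hypothetical variable names:

# Two ops are "connected" when one op's outputs intersect the other's inputs.
op1_output_names = {"fc_0.w_0@GRAD"}
op2_input_names = {"fc_0.w_0@GRAD", "learning_rate_0"}
connected = bool(op1_output_names & op2_input_names)  # True in this example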
...@@ -521,8 +501,8 @@ class DistributeTranspiler: ...@@ -521,8 +501,8 @@ class DistributeTranspiler:
def _is_opt_op(self, op): def _is_opt_op(self, op):
# NOTE: This is a HACK implementation. # NOTE: This is a HACK implementation.
# optimize ops: SGDOptimizer, MomentumOptimizer, AdamOptimizer, etc. # optimize ops: SGDOptimizer, MomentumOptimizer, AdamOptimizer, etc.
if op.inputs and op.inputs.has_key("Param") \ if "Param" in op.input_names and \
and op.inputs.has_key("LearningRate"): "LearningRate" in op.input_names:
return True return True
return False return False
...@@ -530,12 +510,12 @@ class DistributeTranspiler: ...@@ -530,12 +510,12 @@ class DistributeTranspiler:
param_names = [ param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"] p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
] ]
if op.inputs["Param"].name in param_names: if op.input("Param") in param_names:
return True return True
else: else:
for n in param_names: for n in param_names:
param = op.inputs["Param"].name param = op.input("Param")[0]
if same_or_split_var(n, param) and n != op.inputs["Param"].name: if same_or_split_var(n, param) and n != param:
return True return True
return False return False
return False return False
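For reference, a rough sketch of the check same_or_split_var is assumed to perform, based on the ".block%d" naming used when splitting variables (an assumption, not the actual helper):

# Assumed behaviour: "w.block3" is treated as the same variable as "w".
def same_or_split_var_sketch(p_name, var_name):
    return p_name == var_name or p_name.startswith(var_name + ".block")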
...@@ -551,6 +531,8 @@ class DistributeTranspiler: ...@@ -551,6 +531,8 @@ class DistributeTranspiler:
""" """
# step5 # step5
pserver_program = Program() pserver_program = Program()
print("param mapping on pserver: #### ",
self.param_grad_ep_mapping[endpoint]["params"])
for v in self.param_grad_ep_mapping[endpoint]["params"]: for v in self.param_grad_ep_mapping[endpoint]["params"]:
self._clone_var(pserver_program.global_block(), v) self._clone_var(pserver_program.global_block(), v)
for v in self.param_grad_ep_mapping[endpoint]["grads"]: for v in self.param_grad_ep_mapping[endpoint]["grads"]:
...@@ -564,7 +546,6 @@ class DistributeTranspiler: ...@@ -564,7 +546,6 @@ class DistributeTranspiler:
persistable=True, persistable=True,
dtype=v.dtype, dtype=v.dtype,
shape=v.shape) shape=v.shape)
# step6 # step6
optimize_block = pserver_program.create_block(0) optimize_block = pserver_program.create_block(0)
# step 6.1 # step 6.1
......
...@@ -400,9 +400,6 @@ class Operator(object): ...@@ -400,9 +400,6 @@ class Operator(object):
""" """
self.block = block self.block = block
self.desc = desc self.desc = desc
# for clone a new operator
self.inputs = inputs
self.outputs = outputs
self.attrs = attrs self.attrs = attrs
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
......
...@@ -16,8 +16,6 @@ import ops ...@@ -16,8 +16,6 @@ import ops
from ops import * from ops import *
import nn import nn
from nn import * from nn import *
import detection
from detection import *
import io import io
from io import * from io import *
import tensor import tensor
...@@ -33,7 +31,6 @@ from detection import * ...@@ -33,7 +31,6 @@ from detection import *
__all__ = [] __all__ = []
__all__ += math_op_patch.__all__ __all__ += math_op_patch.__all__
__all__ += detection.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += io.__all__ __all__ += io.__all__
__all__ += tensor.__all__ __all__ += tensor.__all__
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
...@@ -15,20 +15,32 @@ ...@@ -15,20 +15,32 @@
All layers related to the detection neural network. All layers related to the detection neural network.
""" """
from layer_function_generator import generate_layer_fn
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr
from ..framework import Variable
import tensor import tensor
import ops import ops
import nn import nn
import math import math
__all__ = [ __all__ = [
'detection_output',
'prior_box', 'prior_box',
'multi_box_head', 'multi_box_head',
'bipartite_match',
'target_assign',
'detection_output',
'ssd_loss',
]
__auto__ = [
'iou_similarity',
'box_coder',
] ]
__all__ += __auto__
for _OP in set(__auto__):
globals()[_OP] = generate_layer_fn(_OP)
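The loop above exposes each op listed in __auto__ as an ordinary layer function; the new unit test below calls them like this (repeated here for convenience):

import paddle.v2.fluid.layers as layers

x = layers.data(name='x', shape=[4], dtype='float32')
y = layers.data(name='y', shape=[4], dtype='float32')
z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
iou = layers.iou_similarity(x=x, y=y)
bcoder = layers.box_coder(
    prior_box=x, prior_box_var=y, target_box=z,
    code_type='encode_center_size')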
def detection_output(scores, def detection_output(scores,
loc, loc,
...@@ -98,18 +110,13 @@ def detection_output(scores, ...@@ -98,18 +110,13 @@ def detection_output(scores,
""" """
helper = LayerHelper("detection_output", **locals()) helper = LayerHelper("detection_output", **locals())
decoded_box = helper.create_tmp_variable(dtype=loc.dtype) decoded_box = box_coder(
helper.append_op( prior_box=prior_box,
type="box_coder", prior_box_var=prior_box_var,
inputs={ target_box=loc,
'PriorBox': prior_box, code_type='decode_center_size')
'PriorBoxVar': prior_box_var,
'TargetBox': loc
},
outputs={'OutputBox': decoded_box},
attrs={'code_type': 'decode_center_size'})
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
inputs={'Scores': scores, inputs={'Scores': scores,
...@@ -280,22 +287,22 @@ def prior_box(inputs, ...@@ -280,22 +287,22 @@ def prior_box(inputs,
if aspect_ratios: if aspect_ratios:
_is_list_or_tuple_and_equal( _is_list_or_tuple_and_equal(
aspect_ratios, num_layer, aspect_ratios, num_layer,
'aspect_ratios should be list and the length of inputs ' 'aspect_ratios should be list or tuple, and the length of inputs '
'and aspect_ratios should be the same.') 'and aspect_ratios should be the same.')
if step_h: if step_h:
_is_list_or_tuple_and_equal( _is_list_or_tuple_and_equal(
step_h, num_layer, step_h, num_layer,
'step_h should be list and the length of inputs and ' 'step_h should be list or tuple, and the length of inputs and '
'step_h should be the same.') 'step_h should be the same.')
if step_w: if step_w:
_is_list_or_tuple_and_equal( _is_list_or_tuple_and_equal(
step_w, num_layer, step_w, num_layer,
'step_w should be list and the length of inputs and ' 'step_w should be list or tuple, and the length of inputs and '
'step_w should be the same.') 'step_w should be the same.')
if steps: if steps:
_is_list_or_tuple_and_equal( _is_list_or_tuple_and_equal(
steps, num_layer, steps, num_layer,
'steps should be list and the length of inputs and ' 'steps should be list or tuple, and the length of inputs and '
'step_w should be the same.') 'step_w should be the same.')
step_w = steps step_w = steps
step_h = steps step_h = steps
...@@ -339,6 +346,331 @@ def prior_box(inputs, ...@@ -339,6 +346,331 @@ def prior_box(inputs,
return box, var return box, var
def bipartite_match(dist_matrix, name=None):
"""
**Bipartite matching operator**
This operator implements a greedy bipartite matching algorithm, which is used
to obtain the matching with the maximum distance based on the input
distance matrix. For an input 2D matrix, the bipartite matching algorithm can
find the matched column for each row, and can also find the matched row for
each column. This operator only calculates matched indices from column
to row. For each instance, the number of matched indices is the number
of columns of the input distance matrix.
There are two outputs, to save matched indices and distances.
Briefly, this algorithm matches the best (maximum distance) row entity
to each column entity, and the matched indices are not duplicated
in each row of ColToRowMatchIndices. If a column entity is not matched to
any row entity, -1 is set in ColToRowMatchIndices.
Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
If Tensor, the height of ColToRowMatchIndices is 1.
Args:
dist_matrix(Variable): This input is a 2-D LoDTensor with shape
[K, M]. It is the pair-wise distance matrix between the entities
represented by each row and each column. For example, assume one
entity is A with shape [K] and another entity is B with shape [M];
dist_matrix[i][j] is the distance between A[i] and B[j]. The larger
the distance is, the better the match. Please note that
this tensor can contain LoD information to represent a batch of
inputs. One instance of this batch can contain different numbers of
entities.
Returns:
match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
N is the batch size. If match_indices[i][j] is -1, it
means B[j] does not match any entity in i-th instance.
Otherwise, it means B[j] is matched to row
match_indices[i][j] in i-th instance. The row number of
i-th instance is saved in match_indices[i][j].
match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
N is batch size. If match_indices[i][j] is -1,
match_distance[i][j] is also -1.0. Otherwise, assume
match_indices[i][j] = d, and call the row offsets of each instance LoD;
then match_distance[i][j] = dist_matrix[d + LoD[i]][j].
"""
helper = LayerHelper('bipartite_match', **locals())
match_indices = helper.create_tmp_variable(dtype='int32')
match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype)
helper.append_op(
type='bipartite_match',
inputs={'DistMat': dist_matrix},
outputs={
'ColToRowMatchIndices': match_indices,
'ColToRowMatchDist': match_distance
})
return match_indices, match_distance
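A minimal usage sketch, following the way ssd_loss below wires it up (the IoU matrix between ground-truth and prior boxes serves as the distance matrix; the data names and shapes are illustrative):

import paddle.v2.fluid.layers as layers

gt_box = layers.data(
    name='gt_box', shape=[4], lod_level=1, dtype='float32')
prior = layers.data(
    name='prior', shape=[10, 4], append_batch_size=False, dtype='float32')
iou = layers.iou_similarity(x=gt_box, y=prior)
match_indices, match_dist = layers.bipartite_match(iou)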
def target_assign(input,
matched_indices,
negative_indices=None,
mismatch_value=None,
name=None):
"""
**Target assigner operator**
Given the target bounding boxes or labels, this operator assigns
classification and regression targets to each prediction, as well as
weights for each prediction. The weights are used to specify which
predictions should not contribute to the training loss.
For each instance, the outputs `out` and `out_weight` are assigned based on
`match_indices` and `negative_indices`.
Assumed that the row offset for each instance in `input` is called lod,
this operator assigns classification/regression targets by performing the
following steps:
1. Assigning all outputs based on `match_indices`:
If id = match_indices[i][j] > 0,
out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
out_weight[i][j] = 1.
Otherwise,
out[i][j][0 : K] = {mismatch_value, mismatch_value, ...}
out_weight[i][j] = 0.
2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
for i-th instance and each `id` of neg_indices in this instance:
out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
out_weight[i][id] = 1.0
Args:
input (Variable): This input is a 3D LoDTensor with shape [M, P, K].
matched_indices (Variable): The input matched indices are a
2D Tensor<int32> with shape [N, P]. If MatchIndices[i][j] is -1,
the j-th column entity is not matched to any row entity in the
i-th instance.
negative_indices (Variable): The input negative example indices are
an optional input with shape [Neg, 1] and int32 type, where Neg is
the total number of negative example indices.
mismatch_value (float32): Fill this value to the mismatched location.
Returns:
out (Variable): The output is a 3D Tensor with shape [N, P, K],
where N and P are the same as they are in `matched_indices`, and K is
the same as it is in the input X.
out_weight (Variable): The weight for output with the shape of [N, P, 1].
"""
helper = LayerHelper('target_assign', **locals())
out = helper.create_tmp_variable(dtype=input.dtype)
out_weight = helper.create_tmp_variable(dtype='float32')
helper.append_op(
type='target_assign',
inputs={
'X': input,
'MatchIndices': matched_indices,
'NegIndices': negative_indices
},
outputs={'Out': out,
'OutWeight': out_weight},
attrs={'mismatch_value': mismatch_value})
return out, out_weight
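Usage mirrors the new test_detection_api case below (a sketch; match_indices is the output of bipartite_match as in the example above):

import paddle.v2.fluid.layers as layers

gt = layers.data(name='gt', shape=[1, 1], dtype='int32', lod_level=1)
# match_indices is produced by layers.bipartite_match(...)
trg, trg_weight = layers.target_assign(
    gt, match_indices, mismatch_value=0)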
def ssd_loss(location,
confidence,
gt_box,
gt_label,
prior_box,
prior_box_var=None,
background_label=0,
overlap_threshold=0.5,
neg_pos_ratio=3.0,
neg_overlap=0.5,
loc_loss_weight=1.0,
conf_loss_weight=1.0,
match_type='per_prediction',
mining_type='max_negative',
sample_size=None):
"""
**Multi-box loss layer for the SSD object detection algorithm**
This layer computes the detection loss for SSD, given the location offset
predictions, confidence predictions, prior boxes, ground-truth bounding
boxes and labels, and the type of hard example mining. The returned loss
is a weighted sum of the localization loss (or regression loss) and the
confidence loss (or classification loss), computed by performing the following steps:
1. Find matched bounding boxes by the bipartite matching algorithm.
1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
1.2 Compute matched bounding boxes by the bipartite matching algorithm.
2. Compute confidence for mining hard examples
2.1. Get the target label based on matched indices.
2.2. Compute confidence loss.
3. Apply hard example mining to get the negative example indices and update
the matched indices.
4. Assign classification and regression targets
4.1. Encoded bbox according to the prior boxes.
4.2. Assign regression targets.
4.3. Assign classification targets.
5. Compute the overall objective loss.
5.1 Compute confidence loss.
5.2 Compute localization loss.
5.3 Compute the overall weighted loss.
Args:
location (Variable): The location predictions are a 3D Tensor with
shape [N, Np, 4], N is the batch size, Np is total number of
predictions for each instance. 4 is the number of coordinate values,
the layout is [xmin, ymin, xmax, ymax].
confidence (Variable): The confidence predictions are a 3D Tensor
with shape [N, Np, C], N and Np are the same as they are in
`location`, C is the class number.
gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D
LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
bboxes of mini-batch input.
gt_label (Variable): The ground-truth labels are a 2D LoDTensor
with shape [Ng, 1].
prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4].
prior_box_var (Variable): The variance of prior boxes are a 2D Tensor
with shape [Np, 4].
background_label (int): The index of background label, 0 by default.
overlap_threshold (float): If match_type is 'per_prediction', use
`overlap_threshold` to determine the extra matching bboxes when
finding matched boxes. 0.5 by default.
neg_pos_ratio (float): The ratio of the negative boxes to the positive
boxes, used only when mining_type is max_negative, 3.0 by default.
neg_overlap (float): The negative overlap upper bound for the unmatched
predictions. Use only when mining_type is max_negative,
0.5 by default.
sample_size (int): The max sample size of negative box, used only when
mining_type is hard_example.
loc_loss_weight (float): Weight for localization loss, 1.0 by default.
conf_loss_weight (float): Weight for confidence loss, 1.0 by default.
match_type (str): The type of matching method during training, should
be 'bipartite' or 'per_prediction'.
mining_type (str): The hard example mining type, should be 'hard_example'
or 'max_negative', now only support `max_negative`.
Returns:
Variable: The weighted sum of the localization loss and confidence loss,
with shape [N * Np, 1], N and Np are the same as they are
in `location`.
Raises:
ValueError: If mining_type is 'hard_example'; currently only the
mining type `max_negative` is supported.
Examples:
.. code-block:: python
pb = layers.data(
name='prior_box',
shape=[10, 4],
append_batch_size=False,
dtype='float32')
pbv = layers.data(
name='prior_box_var',
shape=[10, 4],
append_batch_size=False,
dtype='float32')
loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32')
gt_label = layers.data(
name='gt_label', shape=[1], lod_level=1, dtype='float32')
loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
"""
helper = LayerHelper('ssd_loss', **locals())
if mining_type != 'max_negative':
raise ValueError("Only support mining_type == max_negative now.")
num, num_prior, num_class = confidence.shape
def __reshape_to_2d(var):
return ops.reshape(x=var, shape=[-1, var.shape[-1]])
# 1. Find matched bounding box by prior box.
# 1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
iou = iou_similarity(x=gt_box, y=prior_box)
# 1.2 Compute matched bounding box by bipartite matching algorithm.
matched_indices, matched_dist = bipartite_match(iou)
# 2. Compute confidence for mining hard examples
# 2.1. Get the target label based on matched indices
gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
target_label, _ = target_assign(
gt_label, matched_indices, mismatch_value=background_label)
# 2.2. Compute confidence loss.
# Reshape confidence to 2D tensor.
confidence = __reshape_to_2d(confidence)
target_label = tensor.cast(x=target_label, dtype='int64')
target_label = __reshape_to_2d(target_label)
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype
updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
helper.append_op(
type='mine_hard_examples',
inputs={
'ClsLoss': conf_loss,
'LocLoss': None,
'MatchIndices': matched_indices,
'MatchDist': matched_dist,
},
outputs={
'NegIndices': neg_indices,
'UpdatedMatchIndices': updated_matched_indices
},
attrs={
'neg_pos_ratio': neg_pos_ratio,
'neg_dist_threshold': neg_overlap,  # negative overlap upper bound for unmatched predictions
'mining_type': mining_type,
'sample_size': sample_size,
})
# 4. Assign classification and regression targets
# 4.1. Encoded bbox according to the prior boxes.
encoded_bbox = box_coder(
prior_box=prior_box,
prior_box_var=prior_box_var,
target_box=gt_box,
code_type='encode_center_size')
# 4.2. Assign regression targets
target_bbox, target_loc_weight = target_assign(
encoded_bbox, updated_matched_indices, mismatch_value=background_label)
# 4.3. Assign classification targets
target_label, target_conf_weight = target_assign(
gt_label,
updated_matched_indices,
negative_indices=neg_indices,
mismatch_value=background_label)
# 5. Compute loss.
# 5.1 Compute confidence loss.
target_label = __reshape_to_2d(target_label)
target_label = tensor.cast(x=target_label, dtype='int64')
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
target_conf_weight = __reshape_to_2d(target_conf_weight)
conf_loss = conf_loss * target_conf_weight
# 5.2 Compute regression loss.
location = __reshape_to_2d(location)
target_bbox = __reshape_to_2d(target_bbox)
loc_loss = nn.smooth_l1(location, target_bbox)
target_loc_weight = __reshape_to_2d(target_loc_weight)
loc_loss = loc_loss * target_loc_weight
# 5.3 Compute overall weighted loss.
loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
return loss
def multi_box_head(inputs, def multi_box_head(inputs,
num_classes, num_classes,
min_sizes=None, min_sizes=None,
......
...@@ -15,12 +15,11 @@ ...@@ -15,12 +15,11 @@
from __future__ import print_function from __future__ import print_function
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.layers.detection as detection
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
import unittest import unittest
class TestBook(unittest.TestCase): class TestDetection(unittest.TestCase):
def test_detection_output(self): def test_detection_output(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
...@@ -47,7 +46,67 @@ class TestBook(unittest.TestCase): ...@@ -47,7 +46,67 @@ class TestBook(unittest.TestCase):
out = layers.detection_output( out = layers.detection_output(
scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv) scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv)
self.assertIsNotNone(out) self.assertIsNotNone(out)
# print(str(program)) self.assertEqual(out.shape[-1], 6)
print(str(program))
def test_detection_api(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[4], dtype='float32')
y = layers.data(name='y', shape=[4], dtype='float32')
z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
iou = layers.iou_similarity(x=x, y=y)
bcoder = layers.box_coder(
prior_box=x,
prior_box_var=y,
target_box=z,
code_type='encode_center_size')
self.assertIsNotNone(iou)
self.assertIsNotNone(bcoder)
matched_indices, matched_dist = layers.bipartite_match(iou)
self.assertIsNotNone(matched_indices)
self.assertIsNotNone(matched_dist)
gt = layers.data(
name='gt', shape=[1, 1], dtype='int32', lod_level=1)
trg, trg_weight = layers.target_assign(
gt, matched_indices, mismatch_value=0)
self.assertIsNotNone(trg)
self.assertIsNotNone(trg_weight)
gt2 = layers.data(
name='gt2', shape=[10, 4], dtype='float32', lod_level=1)
trg, trg_weight = layers.target_assign(
gt2, matched_indices, mismatch_value=0)
self.assertIsNotNone(trg)
self.assertIsNotNone(trg_weight)
print(str(program))
def test_ssd_loss(self):
program = Program()
with program_guard(program):
pb = layers.data(
name='prior_box',
shape=[10, 4],
append_batch_size=False,
dtype='float32')
pbv = layers.data(
name='prior_box_var',
shape=[10, 4],
append_batch_size=False,
dtype='float32')
loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32')
gt_label = layers.data(
name='gt_label', shape=[1], lod_level=1, dtype='int32')
loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
self.assertIsNotNone(loss)
self.assertEqual(loss.shape[-1], 1)
print(str(program))
class TestPriorBox(unittest.TestCase): class TestPriorBox(unittest.TestCase):
...@@ -68,7 +127,7 @@ class TestPriorBox(unittest.TestCase): ...@@ -68,7 +127,7 @@ class TestPriorBox(unittest.TestCase):
conv4 = fluid.layers.conv2d(conv3, 3, 3, 2) conv4 = fluid.layers.conv2d(conv3, 3, 3, 2)
conv5 = fluid.layers.conv2d(conv4, 3, 3, 2) conv5 = fluid.layers.conv2d(conv4, 3, 3, 2)
box, var = detection.prior_box( box, var = layers.prior_box(
inputs=[conv1, conv2, conv3, conv4, conv5, conv5], inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
image=images, image=images,
min_ratio=20, min_ratio=20,
......