Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into crop_op

Conflicts: paddle/pybind/pybind.cc

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into crop_op
Conflicts: paddle/pybind/pybind.cc
b21aee63 · wanghaoshuang · a8584a99 · bc55c20f · b21aee63 · b21aee63
84 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ cmake-build-*
 # generated while compiling
 python/paddle/v2/framework/core.so
+paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp

--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -26,9 +26,9 @@ set(IGNORE_PATTERN
    .*ImportanceSampler.*
    .*cblas\\.h.*
    .*\\.pb\\.txt
-    .*LtrDataProvider.*
    .*MultiDataProvider.*
-    .*pb.*)
+    .*pb.*
+    .*pybind.h)
 # add_style_check_target
 #

--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
+IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_block().
-```python
-import paddle as pd
-x = var()
-y = var()
-cond = var()
-b = pd.create_ifop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
-```
-If we want the output still has N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
 ```python
 import paddle as pd
@@ -39,7 +21,7 @@ with b.false_block():
 out = b(cond)
 ```
-If only true_block is set in an IfElseOp, we can have a default value for false as:
+If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd

--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -34,7 +34,7 @@ Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU
 注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，GPU实现在`.cu`文件中
-实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
 下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
@@ -224,45 +224,15 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 ### 5. 编译
- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。
+运行下面命令可以进行编译：
- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如，`mul_op` 依赖 `math_function`，需要在`CMakeLists.txt`中添加如下内容：
-    ```
+```
-    op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)		 +
+make mul_op
-    ```
+```
- 运行下面命令可以进行编译：
-    ```
-    make mul_op
-    ```
 ## 绑定Python
- 绑定Python
+系统会对新增的op自动绑定Python，并链接到生成的lib库中。
-    在 [`paddle/pybind/pybind.cc
-`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 使用`USE_OP`告知编译器需要链接的Op，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
-    ```
-    USE_OP(mul);
-    ```
-    如果只实现了CPU版本，则使用`USE_CPU_ONLY_OP`:
-    ```
-    USE_CPU_ONLY_OP(gather);
-    ```
-    如果OP不带Kernel，则使用`USE_NO_KENREL_OP`:
-    ```
-    USE_NO_KENREL_OP(recurrent);
-    ```
- - 生成库
-   `paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
 ## 实现单元测试
@@ -367,3 +337,10 @@ make test ARGS="-R test_mul_op -V"
 ```bash
 ctest -R test_mul_op
 ```
+## 注意事项
+- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
+- 如果Op没有实现GPU Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -51,18 +51,15 @@ bool operator==(const LoD& a, const LoD& b);
 * LoDTensor (Level of details Tensor)
 * see https://en.wikipedia.org/wiki/Level_of_details for reference.
 */
-class LoDTensor {
+class LoDTensor : public Tensor {
 public:
  LoDTensor() {}
-  LoDTensor(const LoD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
-  void set_lod(const LoD& lod) { lod_ = lod; }
+  explicit LoDTensor(const LoD& lod) : lod_(lod) {}
-  void set_tensor(Tensor* tensor) { tensor_ = tensor; }
-  Tensor& tensor() { return *tensor_; }
+  void set_lod(const LoD& lod) { lod_ = lod; }
-  LoD lod() { return lod_; }
+  LoD lod() const { return lod_; }
  /*
   * Get a element from LoD.
@@ -104,7 +101,6 @@ class LoDTensor {
 private:
  LoD lod_;
-  Tensor* tensor_;  // not owned
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -36,69 +36,64 @@ class LoDTensorTester : public ::testing::Test {
    ASSERT_EQ(lod.size(), 3UL);
-    tensor.Resize({20 /*batch size*/, 128 /*dim*/});
+    lod_tensor_.Resize({20 /*batch size*/, 128 /*dim*/});
    // malloc memory
-    tensor.mutable_data<float>(place);
+    lod_tensor_.mutable_data<float>(place);
-    lod_tensor.set_lod(lod);
+    lod_tensor_.set_lod(lod);
-    lod_tensor.set_tensor(&tensor);
  }
 protected:
  platform::CPUPlace place;
-  Tensor tensor;
+  LoDTensor lod_tensor_;
-  LoDTensor lod_tensor;
 };
-TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
+TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
 TEST_F(LoDTensorTester, NumElements) {
-  ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
-  ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
+  ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 TEST_F(LoDTensorTester, SliceLevels) {
  // slice 1 level
  for (size_t level = 0; level < 3UL; ++level) {
-    LoDTensor new_lod_tensor = lod_tensor;
+    LoDTensor new_lod_tensor = lod_tensor_;
    new_lod_tensor.SliceLevels(level, level + 1);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
-    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
-              lod_tensor.tensor().data<float>());
  }
  // slice 2 level
  for (size_t level = 0; level < 2UL; ++level) {
-    LoDTensor new_lod_tensor = lod_tensor;
+    LoDTensor new_lod_tensor = lod_tensor_;
    new_lod_tensor.SliceLevels(level, level + 2);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1), lod_tensor.NumElements(level + 1));
+    ASSERT_EQ(new_lod_tensor.NumElements(1),
-    ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+              lod_tensor_.NumElements(level + 1));
-              lod_tensor.tensor().data<float>());
+    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
  }
 }
 TEST_F(LoDTensorTester, SliceInLevel) {
  size_t level = 0;
-  LoDTensor new_lod_tensor = lod_tensor;
+  LoDTensor new_lod_tensor = lod_tensor_;
  new_lod_tensor.SliceInLevel(level, 0, 2);
  EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
-  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
-            lod_tensor.tensor().data<float>());
  level = 1;
-  new_lod_tensor = lod_tensor;
+  new_lod_tensor = lod_tensor_;
  new_lod_tensor.SliceInLevel(level, 0, 2);
  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
+  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
-            lod_tensor.tensor().data<float>());
 }
 }  // namespace framework

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -26,18 +26,16 @@ __global__ void test(size_t* a, int size) {
 }
 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::Tensor tensor;
  paddle::framework::LoDTensor lod_tensor;
  paddle::platform::GPUPlace place(0);
  paddle::framework::LoD src_lod;
  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
-  tensor.Resize({14, 16});
+  lod_tensor.Resize({14, 16});
-  tensor.mutable_data<float>(place);
+  lod_tensor.mutable_data<float>(place);
  lod_tensor.set_lod(src_lod);
-  lod_tensor.set_tensor(&tensor);
  CHECK_EQ(lod_tensor.lod_element(0, 2), 4);
  CHECK_EQ(lod_tensor.lod_element(0, 4), 8);

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -186,6 +186,48 @@ void OperatorBase::GenerateTemporaryNames() {
  }
 }
+// Tensor specialization of Input: resolves the variable bound to input `name`
+// and returns the Tensor it holds, or nullptr when InputVar yields no variable.
+template <>
+const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const {
+  auto* input_var = InputVar(name);
+  if (input_var == nullptr) {
+    return nullptr;
+  }
+  return GetTensorFromVar(input_var);
+}
+// Tensor specialization of MultiInput: returns one entry per variable name
+// bound to input `name`, in order. An entry is nullptr when the scope has no
+// variable with that name.
+template <>
+const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
+    const std::string& name) const {
+  auto var_names = op().Inputs(name);
+  std::vector<const Tensor*> tensors;
+  tensors.reserve(var_names.size());
+  for (const std::string& var_name : var_names) {
+    auto found = scope_.FindVar(var_name);
+    const Tensor* tensor = nullptr;
+    if (found != nullptr) {
+      tensor = GetTensorFromVar(found);
+    }
+    tensors.push_back(tensor);
+  }
+  return tensors;
+}
+// Tensor specialization of Output: returns the Tensor held by the variable
+// bound to output `name` as a mutable pointer, or nullptr when OutputVar
+// yields no variable.
+template <>
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
+  auto* output_var = OutputVar(name);
+  if (output_var == nullptr) {
+    return nullptr;
+  }
+  // GetTensorFromVar hands back a const pointer; callers of Output write
+  // through it, hence the const_cast.
+  return const_cast<Tensor*>(GetTensorFromVar(output_var));
+}
+// Tensor specialization of MultiOutput: returns one mutable Tensor pointer per
+// variable name bound to output `name`, in order. An entry is nullptr when the
+// scope has no variable with that name.
+template <>
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+    const std::string& name) const {
+  auto var_names = op().Outputs(name);
+  std::vector<Tensor*> tensors;
+  tensors.reserve(var_names.size());
+  for (const std::string& var_name : var_names) {
+    auto found = scope().FindVar(var_name);
+    Tensor* tensor = nullptr;
+    if (found != nullptr) {
+      tensor = const_cast<Tensor*>(GetTensorFromVar(found));
+    }
+    tensors.push_back(tensor);
+  }
+  return tensors;
+}
 void OpProtoAndCheckerMaker::Validate() {
  validated_ = true;
  CheckNoDuplicatedInOutAttrs();

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "op_info.h"
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
@@ -326,11 +327,27 @@ class InferShapeContext {
    return res;
  }
+  // Returns the Tensor stored in `var`.
+  // A variable holding a LoDTensor is accepted as well: the address of the
+  // LoDTensor is returned through a `const Tensor*`. Any other held type
+  // fails the enforce below.
+  // `var` must not be null; it is dereferenced unconditionally.
+  const Tensor* GetTensorFromVar(const Variable* var) const {
+    if (var->IsType<LoDTensor>()) {
+      return &var->Get<LoDTensor>();
+    }
+    // The message used to contain a "%s" placeholder with no matching
+    // argument; the variable name is not available here, so the placeholder
+    // is dropped instead of being left unformatted.
+    PADDLE_ENFORCE(var->IsType<Tensor>(),
+                   "The Input must be LoDTensor or Tensor.");
+    return &var->Get<Tensor>();
+  }
 private:
  const OperatorBase& op_;
  const Scope& scope_;
 };
+template <>
+const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const;
+template <>
+const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
+    const std::string& name) const;
 template <typename T>
 struct EigenDeviceConverter;
@@ -363,9 +380,37 @@ class ExecutionContext : public InferShapeContext {
    return device_context_;
  }
+  // Redefinition of Output for the execution context: the value is read with
+  // Variable::Get rather than Variable::GetMutable.
+  // Returns nullptr when OutputVar yields no variable for `name`.
+  template <typename T>
+  T* Output(const std::string& name) const {
+    auto out_var = OutputVar(name);
+    if (out_var == nullptr) {
+      return nullptr;
+    }
+    // Get<T>() returns a const reference; callers write to outputs, hence the
+    // const_cast.
+    return const_cast<T*>(&out_var->Get<T>());
+  }
+  // redefine MultiOutput function.
+  // use Variable::Get instead of Variable::GetMutable
+  //
+  // Returns one T* per variable name bound to the output slot `name`, in
+  // order. An entry is nullptr when the scope has no variable with that name.
+  template <typename T>
+  std::vector<T*> MultiOutput(const std::string& name) const {
+    auto names = op().Outputs(name);
+    std::vector<T*> res;
+    res.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(res),
+        [&](const std::string& sub_name) -> T* {
+          // The entries of `names` are variable names, not output slot names,
+          // so they are looked up in the scope directly (the same way the
+          // Tensor specialization does) instead of being passed back to
+          // Output<T>(), which takes a slot name.
+          auto var = scope().FindVar(sub_name);
+          return var == nullptr ? nullptr : const_cast<T*>(&var->Get<T>());
+        });
+    return res;
+  }
  const platform::DeviceContext* device_context_;
 };
+template <>
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
+template <>
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+    const std::string& name) const;
 class OpKernel {
 public:
  /**

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -22,7 +22,7 @@ namespace framework {
 template <typename T>
 inline void Tensor::check_memory_size() const {
  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
  PADDLE_ENFORCE_GE(
      holder_->size(), numel() * sizeof(T) + offset_,
      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -36,7 +36,7 @@ TEST(Tensor, DataAssert) {
  } catch (paddle::platform::EnforceNotMet err) {
    caught = true;
    std::string msg =
-        "holder_ should not be null\nTenosr holds no memory. Call "
+        "holder_ should not be null\nTensor holds no memory. Call "
        "Tensor::mutable_data first.";
    const char* what = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
@@ -112,7 +112,7 @@ TEST(Tensor, ShareDataWith) {
    } catch (paddle::platform::EnforceNotMet err) {
      caught = true;
      std::string msg =
-          "holder_ should not be null\nTenosr holds no memory. Call "
+          "holder_ should not be null\nTensor holds no memory. Call "
          "Tensor::mutable_data first.";
      const char* what = err.what();
      for (size_t i = 0; i < msg.length(); ++i) {
@@ -274,4 +274,4 @@ TEST(Tensor, ReshapeToMatrix) {
  Tensor res = ReshapeToMatrix<int>(src, 2);
  ASSERT_EQ(res.dims()[0], 2 * 3);
  ASSERT_EQ(res.dims()[1], 4 * 9);
 }
\ No newline at end of file
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNConvLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+namespace paddle {
+REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
+// Reads the layer configuration, validates the settings this layer supports
+// and creates the weight (and, if configured, bias) holders.
+// Returns false if MKLDNNLayer::init fails, true otherwise.
+bool MKLDNNConvLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_EQ(inputLayers_.size(), 1) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(config_.shared_biases()) << "Only support shared biases yet";
+  const ConvConfig& convConf = config_.inputs(0).conv_conf();
+  // channels and groups
+  oc_ = config_.num_filters();
+  ic_ = convConf.channels();
+  gp_ = convConf.groups();
+  // filter size, padding, dilation and stride
+  fh_ = convConf.filter_size_y();
+  fw_ = convConf.filter_size();
+  ph_ = convConf.padding_y();
+  pw_ = convConf.padding();
+  dh_ = convConf.dilation_y();
+  dw_ = convConf.dilation();
+  sh_ = convConf.stride_y();
+  sw_ = convConf.stride();
+  // image and output sizes taken from the config
+  ih_ = convConf.img_size_y();
+  iw_ = convConf.img_size();
+  oh_ = convConf.output_y();
+  ow_ = convConf.output_x();
+  caffeMode_ = convConf.caffe_mode();
+  CHECK(caffeMode_) << "Only support caffe mode yet";
+  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
+  // check group setting
+  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
+  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
+  // create weight; the parameter size must match rows * cols
+  size_t wgtRows = oc_ / gp_;
+  size_t wgtCols = ic_ * fh_ * fw_;
+  CHECK_EQ(parameters_[0]->getSize(), wgtRows * wgtCols);
+  weight_.reset(new Weight(wgtRows, wgtCols, parameters_[0], 0));
+  // create biases only when a bias parameter exists
+  if (biasParameter_.get() != nullptr) {
+    biases_.reset(new Weight(1, oc_, biasParameter_));
+  }
+  return true;
+}
+// Reorders the weight value in place from the paddle layout. Runs at most
+// once: later calls return immediately because hasInitedWgt_ is set.
+void MKLDNNConvLayer::convertWeightsFromPaddle() {
+  if (!hasInitedWgt_) {
+    CHECK(wgtVal_) << "should have been initialized";
+    // the paddle weight format is oihw or goihw
+    auto wgtDims = wgtVal_->getDims();
+    auto paddleFmt = memory::format::goihw;
+    if (gp_ == 1) {
+      paddleFmt = memory::format::oihw;
+    }
+    wgtVal_->reorderDataFrom(wgtVal_, paddleFmt, wgtDims);
+    hasInitedWgt_ = true;
+  }
+}
+// Reorders the weight value in place to the paddle layout: oihw when there is
+// a single group, goihw otherwise.
+void MKLDNNConvLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto wgtDims = wgtVal_->getDims();
+  auto paddleFmt = memory::format::goihw;
+  if (gp_ == 1) {
+    paddleFmt = memory::format::oihw;
+  }
+  wgtVal_->reorderDataTo(wgtVal_, paddleFmt, wgtDims);
+}
+// Recomputes the layer sizes for the current input.
+// `bs`, `ih`, `iw` are passed to reshapeInput; `oh` and `ow` are overwritten
+// with the computed output sizes; `oc` is taken by value and is not changed.
+// `ic` is not touched in this function.
+void MKLDNNConvLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // effective filter extent once dilation is applied
+  const int dilatedFh = (fh_ - 1) * dh_ + 1;
+  const int dilatedFw = (fw_ - 1) * dw_ + 1;
+  // cal output sizes
+  oh = outputSize(ih, dilatedFh, ph_, sh_, caffeMode_);
+  ow = outputSize(iw, dilatedFw, pw_, sw_, caffeMode_);
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+  printSizeInfo();
+}
+// Rebuilds the forward pass: primitive descriptor, then buffers, then the
+// primitive pipeline.
+// `pipeline` is filled by resetFwdPipeline; `in`, `wgt`, `bias` and `out` are
+// set by resetFwdBuffers.
+void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  // the descriptor is kept in the member fwdPD_ so that the backward
+  // descriptors can be created from it later
+  resetFwdPD(fwdPD_);
+  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  printValueFormatFlow();
+}
+// Rebuilds the backward pass: the two backward primitive descriptors, then the
+// gradient buffers, then the primitive pipeline.
+// The descriptors are local to this call; only the primitives built from them
+// are kept.
+void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<conv_bwdWgt::primitive_desc> wgtPD;
+  resetBwdWgtPD(wgtPD);
+  // resetBwdDataPD leaves dataPD null when the input layer has no gradient
+  std::shared_ptr<conv_bwdData::primitive_desc> dataPD;
+  resetBwdDataPD(dataPD);
+  resetBwdBuffers(wgtPD, dataPD, in, wgt, bias, out);
+  resetBwdPipeline(pipeline, wgtPD, dataPD, in, wgt, bias, out);
+  printGradFormatFlow();
+}
+// Points cpuInVal_ at the data of the current CPU input value of input 0.
+// NOTE(review): resetInValue sets cpuInVal_ to nullptr when the input is
+// MKLDNN-only; presumably this is only called when a CPU input exists —
+// confirm against the caller.
+void MKLDNNConvLayer::updateInputData() {
+  cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+}
+// Triggers the incremental update of the weight parameter and, when biases
+// with a gradient exist, of the bias parameter as well.
+void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  // nothing more to do without a bias gradient
+  if (!biases_ || !biases_->getWGrad()) {
+    return;
+  }
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+// Fills the output arguments with the convolution dims derived from the
+// cached layer settings. All six arguments are overwritten.
+void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
+                                       memory::dims& bias,
+                                       memory::dims& stride,
+                                       memory::dims& dilation,
+                                       memory::dims& padL,
+                                       memory::dims& padR) {
+  // grouped convolution carries an extra leading group dimension
+  if (gp_ == 1) {
+    wgt = memory::dims{oc_, ic_, fh_, fw_};
+  } else {
+    wgt = memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
+  }
+  bias = memory::dims{oc_};
+  stride = memory::dims{sh_, sw_};
+  padL = memory::dims{ph_, pw_};
+  padR = getPaddingR();
+  // note: mkldnn dilation start from 0
+  dilation = memory::dims{dh_ - 1, dw_ - 1};
+}
+// Creates the forward convolution primitive descriptor and stores it in `pd`.
+// The descriptor includes a bias memory desc only when biases with a value
+// exist.
+void MKLDNNConvLayer::resetFwdPD(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
+  // dims for conv
+  memory::dims srcDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims dstDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  // scoring for the test pass, training otherwise
+  prop_kind pk = prop_kind::forward_training;
+  if (passType_ == PASS_TEST) {
+    pk = prop_kind::forward_scoring;
+  }
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  if (biases_ && biases_->getW()) {
+    conv_fwd::desc fwdDesc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(srcDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(biasDims),
+                           MKLDNNMatrix::createMemoryDesc(dstDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind);
+    pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+  } else {
+    conv_fwd::desc fwdDesc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(srcDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(dstDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind);
+    pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+  }
+}
+// Creates the forward buffers from the primitive descriptor `pd`, which must
+// be non-null: input value, then weight and bias values, then output value.
+void MKLDNNConvLayer::resetFwdBuffers(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(pd);
+  resetInValue(pd, in);
+  resetWgtBiasValue(pd, wgt, bias);
+  resetOutValue(pd, out);
+}
+// Rebuilds the forward pipeline: optional input reorder, the convolution
+// primitive, optional output reorder. `pipeline` is cleared first.
+void MKLDNNConvLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  // input reorder, only when one was created
+  if (cvtInVal_) {
+    pipeline.push_back(*cvtInVal_);
+  }
+  // the convolution itself, with or without bias
+  fwd_.reset(bias ? new conv_fwd(*pd, *in, *wgt, *bias, *out)
+                  : new conv_fwd(*pd, *in, *wgt, *out));
+  pipeline.push_back(*fwd_);
+  // output reorder, only when one was created
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
+// Sets `in` to the MKLDNN matrix the forward primitive reads from, and
+// resets cpuInVal_ / cvtInVal_ for the current input.
+// After the call, cvtInVal_ is non-null only when a reorder from the CPU input
+// to `in` is needed.
+void MKLDNNConvLayer::resetInValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
+  const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
+  in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
+  // create buffer and reorder if input value do not match
+  cpuInVal_ = nullptr;
+  cvtInVal_ = nullptr;
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+    CHECK(dnnIn) << "Input should be MKLDNNMatrix";
+    if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+      // the only accepted mismatch is an nc input with a 1x1 image
+      CHECK_EQ(dnnIn->getFormat(), format::nc);
+      CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
+      // create a new one with nchw format and same data
+      memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
+      dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
+      CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc());
+    }
+    in = dnnIn;
+  } else {
+    // wrap the CPU input as an nchw matrix
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+    cpuInVal_ = MKLDNNMatrix::create(cpuIn, inDims, format::nchw, engine_);
+    if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+      // create new mkldnn matrix
+      in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
+      cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
+      CHECK(cvtInVal_) << "should not be emptry";
+    } else {
+      // descriptors match: use the CPU input directly, no reorder needed
+      in = cpuInVal_;
+    }
+  }
+}
+// Creates the MKLDNN weight value from the descriptor `pd` and, when biases
+// with a value exist, the bias value; otherwise `bias` is set to nullptr.
+void MKLDNNConvLayer::resetWgtBiasValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias) {
+  wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
+  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
+  if (biases_ && biases_->getW()) {
+    bias = MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
+}
+// Creates the MKLDNN output value from the descriptor `pd`, stores it as the
+// layer's output value, and resets cpuOutVal_ / cvtOutVal_.
+// After the call, cvtOutVal_ is non-null only when the output also has a CPU
+// device and its descriptor differs from `out`.
+void MKLDNNConvLayer::resetOutValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
+  out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
+  // change original output value from cpu matrix to mkldnn matrix
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+  // create reorder if output value has cpu device and pd do not match
+  // Both members must be cleared: resetFwdPipeline appends *cvtOutVal_
+  // whenever it is set, so a reorder left over from a previous reset must not
+  // survive when it is no longer needed.
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be emptry";
+    } else {
+      // CPU output share the same data of MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
+// Creates the backward-weights primitive descriptor and stores it in `pd`.
+// Requires inVal_, outVal_ and wgtVal_ to be set. The descriptor is created
+// with the forward descriptor *fwdPD_ as hint, and its formats are checked
+// against the value buffers.
+void MKLDNNConvLayer::resetBwdWgtPD(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
+  // NOTE: wgtDims, biasDims and dilations are filled here but not used below;
+  // only strides and paddings are passed to the descriptor.
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  // create backward weight using input, output and weight value memory desc
+  CHECK(inVal_) << "Should have input value";
+  CHECK(outVal_) << "Should have output value";
+  CHECK(wgtVal_) << "Should have weight value";
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  // include the bias memory desc only when a bias value exists
+  auto bwdWgtDesc = biasVal_ != nullptr
+                        ? conv_bwdWgt::desc(algo,
+                                            inVal_->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            biasVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind)
+                        : conv_bwdWgt::desc(algo,
+                                            inVal_->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind);
+  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+  // the gradients are expected to use the same formats as the values
+  CHECK(pd->src_primitive_desc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of in value should equal";
+  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad should equal the out value";
+  CHECK(pd->diff_weights_primitive_desc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad should equal the weight value";
+}
+// Creates the backward-data primitive descriptor and stores it in `pd`.
+// `pd` is left null when the input layer has no gradient matrix, which
+// callers use as the signal that no backward-data step is needed.
+void MKLDNNConvLayer::resetBwdDataPD(
+    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  pd = nullptr;
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
+    return;
+  }
+  // NOTE: biasDims and dilations are filled here but not used below.
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  CHECK(inVal_) << "Should have input value";
+  CHECK(outVal_) << "Should have output value";
+  // create backward data using input and output value memory desc
+  // but using weight memory desc with any format
+  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
+                                        inVal_->getMemoryDesc(),
+                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
+                                        outVal_->getMemoryDesc(),
+                                        strides,
+                                        padL,
+                                        padR,
+                                        padding_kind::zero);
+  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+  // the gradients are expected to use the same formats as the values
+  CHECK(pd->diff_src_primitive_desc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of in grad should equal the in value";
+  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad should equal";
+}
+// Creates the backward buffers: output grad, weight/bias grads, input grad,
+// and the weight value used by the backward-data primitive.
+// `wgtPD` must be non-null. `dataPD` is not checked here; it is passed on to
+// resetInGrad and resetWgtValBwdData.
+void MKLDNNConvLayer::resetBwdBuffers(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(wgtPD);
+  resetOutGrad(wgtPD, out);
+  resetWgtBiasGrad(wgtPD, wgt, bias);
+  resetInGrad(dataPD, in);
+  resetWgtValBwdData(dataPD, wgtValBwdData_);
+}
+// Rebuilds the backward pipeline. `pipeline` is cleared first, then filled
+// with: optional output-grad reorder, the backward-weights primitive, and,
+// only when `dataPD` is non-null, optional weight reorder, the backward-data
+// primitive and optional input-grad reorder.
+void MKLDNNConvLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+  // add bwdWgt handle, with or without bias grad
+  bwdWgt_.reset(bias ? new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)
+                     : new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+  pipeline.push_back(*bwdWgt_);
+  // the backward-data part is only built when a data descriptor exists
+  if (dataPD != nullptr) {
+    if (cvtWgtVal_) {
+      pipeline.push_back(*cvtWgtVal_);
+    }
+    // add bwdData handle
+    CHECK(wgtValBwdData_) << "Should have weight memory";
+    bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
+    pipeline.push_back(*bwdData_);
+    if (cvtInGrad_) {
+      pipeline.push_back(*cvtInGrad_);
+    }
+  }
+}
+// Sets `out` to the MKLDNN output-grad matrix and resets cpuOutGrad_ /
+// cvtOutGrad_. Requires outVal_ to be set and to share the primitive desc of
+// the created grad.
+// After the call, cvtOutGrad_ is non-null only when the output also has a CPU
+// device and the CPU grad descriptor differs from `out`.
+void MKLDNNConvLayer::resetOutGrad(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
+  const MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
+  CHECK(outVal_ != nullptr &&
+        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad and value should be equal";
+  // TODO(TJ): merge outgrad
+  // create reorder if the output grad does not match
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // same PrimitiveDesc with cpuOutVal_
+    CHECK(cpuOutVal_);
+    cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
+    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
+      // descriptors match: share the CPU grad data, no reorder needed
+      outMat->setData(cpuOut->getData());
+      out = cpuOutGrad_;
+    } else {
+      // reorder from the CPU grad into the MKLDNN grad
+      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+      CHECK(cvtOutGrad_);
+    }
+  }
+}
+/**
+ * Reset the MKLDNNMatrix of the weight grad and, if the layer has a bias
+ * value, of the bias grad; otherwise bias is set to nullptr.
+ */
+void MKLDNNConvLayer::resetWgtBiasGrad(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias) {
+  // weight grad uses the diff_weights desc chosen by the bwdWgt primitive
+  wgt = MKLDNNMatrix::create(weight_->getWGrad(),
+                             wgtPD->diff_weights_primitive_desc());
+  CHECK(nullptr != wgtVal_ &&
+        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad and value should be equal";
+  VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
+  bias = nullptr;
+  if (biasVal_ != nullptr) {
+    // bias grad only exists when a bias value was set up
+    bias = MKLDNNMatrix::create(biases_->getWGrad(),
+                                wgtPD->diff_bias_primitive_desc());
+    CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
+        << "primitive desc of bias grad should equal the bias value";
+  }
+}
+/**
+ * Reset the MKLDNNMatrix of the input grad; does nothing when dataPD is null.
+ * When the input is not MKLDNN-only, also wrap the CPU-device grad and either
+ * use it directly (same primitive desc) or create a reorder into it.
+ */
+void MKLDNNConvLayer::resetInGrad(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in) {
+  if (dataPD == nullptr) {
+    return;
+  }
+  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
+  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
+                            dataPD->diff_src_primitive_desc());
+  CHECK(nullptr != inVal_ &&
+        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of input grad and value should be equal";
+  // start from a clean state; both are only set for non-MKLDNN-only inputs
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+  const MatrixPtr& cpuGrad = getInputGrad(0, CPU_DEVICE);
+  // the CPU grad uses the same PrimitiveDesc as cpuInVal_
+  CHECK(cpuInVal_);
+  cpuInGrad_ = MKLDNNMatrix::create(cpuGrad, cpuInVal_->getPrimitiveDesc());
+  if (cpuInGrad_->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+    // identical layout: write the input grad straight into the CPU matrix
+    in = cpuInGrad_;
+    return;
+  }
+  // layouts differ: compute into the MKLDNN-device grad, then reorder to CPU
+  const MatrixPtr& dnnGrad = getInputGrad(0, MKLDNN_DEVICE);
+  in = MKLDNNMatrix::create(dnnGrad, in->getPrimitiveDesc());
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
+  CHECK(cvtInGrad_);
+}
+/**
+ * Reset the weight value used by backward data (wgtValBwdData_).
+ * Does nothing when dataPD is null. If backward data wants a primitive desc
+ * different from wgtVal_, a new buffer plus a reorder (cvtWgtVal_) is created;
+ * otherwise wgtVal_ is shared and no reorder is kept.
+ * NOTE(review): the wgt argument is not used here; the visible caller passes
+ * wgtValBwdData_ itself, which this function assigns directly.
+ */
+void MKLDNNConvLayer::resetWgtValBwdData(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& wgt) {
+  if (dataPD == nullptr) {
+    return;
+  }
+  // drop any reorder left over from a previous reset, otherwise
+  // resetBwdPipeline would still push a stale handle when the formats match
+  cvtWgtVal_ = nullptr;
+  // create new weight value for backward data, and create reorder if necessary
+  // since the primitive_desc would be different with wgtVal_
+  CHECK(wgtVal_) << "should have weight value";
+  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
+    wgtValBwdData_ =
+        MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
+    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
+    CHECK(cvtWgtVal_);
+  } else {
+    wgtValBwdData_ = wgtVal_;
+  }
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
+                    << wgtValBwdData_->getFormat();
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+namespace paddle {
+typedef mkldnn::convolution_forward conv_fwd;
+typedef mkldnn::convolution_backward_weights conv_bwdWgt;
+typedef mkldnn::convolution_backward_data conv_bwdData;
+/**
+ * @brief A subclass of MKLDNNLayer conv layer.
+ *
+ * The config file api is mkldnn_conv
+ */
+class MKLDNNConvLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // dilation height and width
+  int dh_, dw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+  // group number
+  int gp_;
+  // weight value used by backward data; its primitive desc may differ
+  // from the one of wgtVal_
+  MKLDNNMatrixPtr wgtValBwdData_;
+  // convert handle from wgtVal_ to wgtValBwdData_
+  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
+  // MKLDNNMatrixPtr which should be created from CPU Device
+  MKLDNNMatrixPtr cpuInVal_;
+  MKLDNNMatrixPtr cpuInGrad_;
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handle between CPU device and MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+  // whether the weight has been init
+  bool hasInitedWgt_;
+  // true by default; it affects how the output image size is calculated.
+  // details can refer to mathUtil.h
+  bool caffeMode_;
+  // weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+public:
+  explicit MKLDNNConvLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
+  ~MKLDNNConvLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void updateInputData() override;
+  void updateWeights(const UpdateCallback& callback) override;
+  void convertWeightsFromPaddle() override;
+  void convertWeightsToPaddle() override;
+  /**
+   * log the base-class size info, then the filter, padding, stride and
+   * dilation sizes of this conv
+   */
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
+  }
+  /**
+   * log the value formats: CPU input (if any), the base-class flow,
+   * then CPU output (if any)
+   */
+  void printValueFormatFlow() override {
+    if (cpuInVal_) {
+      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
+    }
+    MKLDNNLayer::printValueFormatFlow();
+    if (cpuOutVal_) {
+      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
+    }
+  }
+  /**
+   * log the grad formats: CPU input grad (if any), the base-class flow,
+   * then CPU output grad (if any)
+   */
+  void printGradFormatFlow() override {
+    if (cpuInGrad_) {
+      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
+    }
+    MKLDNNLayer::printGradFormatFlow();
+    if (cpuOutGrad_) {
+      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
+    }
+  }
+protected:
+  /**
+   * load the dims settings of this conv
+   */
+  void loadConvSettings(mkldnn::memory::dims& wgt,
+                        mkldnn::memory::dims& bias,
+                        mkldnn::memory::dims& stride,
+                        mkldnn::memory::dims& dilation,
+                        mkldnn::memory::dims& padL,
+                        mkldnn::memory::dims& padR);
+  /**
+   * reset the forward primitive descriptor.
+   */
+  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
+  /**
+   * reset the MKLDNNMatrix buffers used in forward.
+   */
+  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  /**
+   * reset the forward pipeline.
+   */
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * reset MKLDNNMatrix of input value
+   */
+  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                    MKLDNNMatrixPtr& in);
+  /**
+   * reset MKLDNNMatrix of weight and bias value
+   */
+  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                         MKLDNNMatrixPtr& wgt,
+                         MKLDNNMatrixPtr& bias);
+  /**
+   * reset MKLDNNMatrix of output value
+   */
+  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& out);
+  /**
+   * reset the backward weight primitive descriptor.
+   */
+  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
+  /**
+   * reset the backward data primitive descriptor.
+   */
+  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
+  /**
+   * reset the MKLDNNMatrix buffers used in backward.
+   */
+  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  /**
+   * reset the backward pipeline.
+   */
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * reset MKLDNNMatrix of output grad
+   */
+  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                    MKLDNNMatrixPtr& out);
+  /**
+   * reset MKLDNNMatrix of weight and bias grad
+   */
+  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias);
+  /**
+   * reset MKLDNNMatrix of input grad
+   */
+  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                   MKLDNNMatrixPtr& in);
+  /**
+   * reset MKLDNNMatrix of weight value for backward data
+   * since the primitive_desc would be different with wgtVal_
+   */
+  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                          MKLDNNMatrixPtr& wgt);
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_convolution_forward_common.hpp
+   * @note: mkldnn dilation start from 0 while paddle start from 1
+   * @return {right padding of height, right padding of width}; each starts
+   *         from ph_/pw_ and is increased by at most 2
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    // two passes: each pass adds one to a dimension whose computed output
+    // size still differs from oh_/ow_
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
+        ++padR[0];
+      }
+      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -17,9 +17,6 @@ limitations under the License. */
 using namespace mkldnn;  // NOLINT
 typedef memory::format format;
-typedef inner_product_forward fc_fwd;
-typedef inner_product_backward_weights fc_bwdWgt;
-typedef inner_product_backward_data fc_bwdData;
 namespace paddle {
@@ -93,35 +90,88 @@ void MKLDNNFcLayer::reshape(
  printSizeInfo();
 }
-void MKLDNNFcLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline,
+void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
                             MKLDNNMatrixPtr& in,
                             MKLDNNMatrixPtr& wgt,
                             MKLDNNMatrixPtr& bias,
                             MKLDNNMatrixPtr& out) {
-  pipeline.clear();
+  resetFwdBuffers(in, wgt, bias, out);
-  bool hasBias = biases_ && biases_->getW();
-  const MatrixPtr& wgtVal = weight_->getW();
+  resetFwdPD(fwdPD_, in, wgt, bias, out);
-  const MatrixPtr& biasVal = hasBias ? biases_->getW() : nullptr;
-  const MatrixPtr& outVal = output_.value;
+  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  printValueFormatFlow();
+}
+// Rebuild the backward pass: gradient buffers first, then the backward-weight
+// and backward-data primitive descriptors, and finally the pipeline.
+void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
+                             MKLDNNMatrixPtr& in,
+                             MKLDNNMatrixPtr& wgt,
+                             MKLDNNMatrixPtr& bias,
+                             MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(in, wgt, bias, out);
+  // descriptors only live for the duration of this reset
+  std::shared_ptr<fc_bwdWgt::primitive_desc> wgtPD;
+  resetBwdWgtPD(wgtPD, wgt, bias, out);
+  std::shared_ptr<fc_bwdData::primitive_desc> dataPD;
+  resetBwdDataPD(dataPD, in, out);
+  resetBwdPipeline(pipeline, wgtPD, dataPD, in, wgt, bias, out);
+  printGradFormatFlow();
+}
+// Point inVal_ at the data of the current CPU-device input value.
+void MKLDNNFcLayer::updateInputData() {
+  const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+  inVal_->setData(cpuIn->getData());
+}
+// Run the incremental update on the weight parameter and, when a bias with a
+// gradient exists, on the bias parameter as well.
+void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  const bool hasBiasGrad = biases_ && biases_->getWGrad();
+  if (!hasBiasGrad) {
+    return;
+  }
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+// Reset all forward buffers by delegating to the per-buffer helpers:
+// input value, then weight and bias values, then output value.
+void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  resetWgtBiasValue(wgt, bias);
+  resetOutValue(out);
+}
+void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& inVal = getInputValue(0);
+    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(inVal);
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
    CHECK(in) << "Input should be MKLDNNMatrix";
  } else {
    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& inVal = getInputValue(0, CPU_DEVICE);
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
    in = MKLDNNMatrix::create(
-        inVal, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
  }
  in->downSpatial();
+}
+void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
+                                      MKLDNNMatrixPtr& bias) {
  wgt = MKLDNNMatrix::create(
-      wgtVal, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+      weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_);
  wgt->downSpatial();
-  bias = hasBias ? MKLDNNMatrix::create(biasVal, {oc_}, format::x, engine_)
-                 : nullptr;
-  out = MKLDNNMatrix::create(outVal, {bs_, oc_}, format::nc, engine_);
+  bias = (biases_ && biases_->getW())
+             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
+             : nullptr;
+}
+void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
  // change original output value to mkldnn output value
  output_.value = std::dynamic_pointer_cast<Matrix>(out);
  if (!outputIsOnlyMKLDNN()) {
@@ -129,46 +179,59 @@ void MKLDNNFcLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline,
    // just share point
    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
  }
+}
-  // create forward handle
+void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                               MKLDNNMatrixPtr in,
+                               MKLDNNMatrixPtr wgt,
+                               MKLDNNMatrixPtr bias,
+                               MKLDNNMatrixPtr out) {
+  CHECK(in);
+  CHECK(wgt);
+  CHECK(out);
  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
-                                                in->getMemoryDesc(),
+                                                        in->getMemoryDesc(),
-                                                wgt->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
-                                                bias->getMemoryDesc(),
+                                                        bias->getMemoryDesc(),
-                                                out->getMemoryDesc())
+                                                        out->getMemoryDesc())
-                                 : fc_fwd::desc(pk,
+                                         : fc_fwd::desc(pk,
-                                                in->getMemoryDesc(),
+                                                        in->getMemoryDesc(),
-                                                wgt->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
-                                                out->getMemoryDesc());
+                                                        out->getMemoryDesc());
-  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
-  if (hasBias) {
+}
-    fwd_.reset(new fc_fwd(fwdPD, *in, *wgt, *bias, *out));
+void MKLDNNFcLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  if (bias) {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
  } else {
-    fwd_.reset(new fc_fwd(fwdPD, *in, *wgt, *out));
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
  }
-  printValueFormatFlow();
  pipeline.push_back(*fwd_);
 }
-void MKLDNNFcLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline,
+void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
-                             MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
-                             MKLDNNMatrixPtr& out) {
+  resetOutGrad(out);
-  pipeline.clear();
-  if (!needResetBwd_) {
+  resetWgtBiasGrad(wgt, bias);
-    return;
-  }
-  needResetBwd_ = false;
-  bool hasBias = biases_ && biases_->getWGrad();
-  /// backward weight
+  resetInGrad(in);
-  CHECK(inVal_) << "Should have input value";
+}
-  const MatrixPtr& wgtGrad = weight_->getWGrad();
-  const MatrixPtr& biasGrad = hasBias ? biases_->getWGrad() : nullptr;
+void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
  // TODO(TJ): merge outgrad
  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
  // for MKLDNN device:
@@ -178,66 +241,88 @@ void MKLDNNFcLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline,
  // for CPU device:
  // fc do not need to convert from cpu device since output is always nc format
  // only need create from cpu device
-  const MatrixPtr& outGrad = getOutput(device).grad;
+  CHECK(outVal_);
-  out = MKLDNNMatrix::create(outGrad, outVal_->getPrimitiveDesc());
+  out =
-  wgt = MKLDNNMatrix::create(wgtGrad, wgtVal_->getPrimitiveDesc());
+      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
-  bias = hasBias ? MKLDNNMatrix::create(biasGrad, biasVal_->getPrimitiveDesc())
+}
-                 : nullptr;
+void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
-  // create memory primitive desc
+                                     MKLDNNMatrixPtr& bias) {
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+  CHECK(wgtVal_);
-                                      inVal_->getMemoryDesc(),
+  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-                                      wgt->getMemoryDesc(),
-                                      out->getMemoryDesc());
+  bias = nullptr;
-  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+  if (biasVal_ == nullptr) {
-  fc_bwdWgt::desc bwdWgtDesc = hasBias
+    return;
-                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                     wgt->getMemoryDesc(),
-                                                     bias->getMemoryDesc(),
-                                                     out->getMemoryDesc())
-                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                     wgt->getMemoryDesc(),
-                                                     out->getMemoryDesc());
-  fc_bwdWgt::primitive_desc bwdWgtPD =
-      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
-  if (hasBias) {
-    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *out, *wgt));
  }
-  pipeline.push_back(*bwdWgt_);
+  bias =
+      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+}
-  /// backward data
+void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
  if (inGrad == nullptr) {
    return;
  }
-  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
+  CHECK(inVal_);
-  } else {
+  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
-    in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+}
-  }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
-      inVal_->getMemoryDesc(), wgt->getMemoryDesc(), out->getMemoryDesc());
-  fc_bwdData::primitive_desc bwdDataPD =
-      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  CHECK(wgtVal_) << "Should have weight memory";
+void MKLDNNFcLayer::resetBwdWgtPD(
-  bwdData_.reset(new fc_bwdData(bwdDataPD, *out, *wgtVal_, *in));
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-  printGradFormatFlow();
+    MKLDNNMatrixPtr& wgt,
-  pipeline.push_back(*bwdData_);
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVal_);
+  fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                      wgt->getMemoryDesc(),
+                                                      bias->getMemoryDesc(),
+                                                      out->getMemoryDesc())
+                                    : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                      wgt->getMemoryDesc(),
+                                                      out->getMemoryDesc());
+  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
 }
-void MKLDNNFcLayer::updateInputData() {
+void MKLDNNFcLayer::resetBwdDataPD(
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_);
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
+      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
+  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
 }
-void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
+void MKLDNNFcLayer::resetBwdPipeline(
-  weight_->getParameterPtr()->incUpdate(callback);
+    std::vector<primitive>& pipeline,
-  if (biases_ && biases_->getWGrad()) {
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-    biases_->getParameterPtr()->incUpdate(callback);
+    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  CHECK(inVal_);
+  if (bias) {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+  if (bwdDataPD == nullptr) {
+    return;
  }
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
+  pipeline.push_back(*bwdData_);
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -18,6 +18,9 @@ limitations under the License. */
 #include "mkldnn.hpp"
 namespace paddle {
+typedef mkldnn::inner_product_forward fc_fwd;
+typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
+typedef mkldnn::inner_product_backward_data fc_bwdData;
 /**
 * @brief A subclass of MKLDNNLayer fc layer.
@@ -32,6 +35,9 @@ protected:
  // if has already init the weight
  bool hasInitedWgt_;
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
  // fc weight and bias
  std::unique_ptr<Weight> weight_;
  std::unique_ptr<Weight> biases_;
@@ -67,6 +73,59 @@ public:
  void convertWeightsFromPaddle() override;
  void convertWeightsToPaddle() override;
+protected:
+  /**
+   * Forward functions: reset buffers(input, output, weight and bias),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   *
+   * resetFwdBuffers:   calls resetInValue, resetWgtBiasValue and
+   *                    resetOutValue in that order.
+   * resetInValue:      takes the MKLDNN input value directly, or wraps the
+   *                    CPU input value in nchw format.
+   * resetWgtBiasValue: wraps the weight value in oihw format; bias is
+   *                    nullptr when the layer has no bias value.
+   * resetOutValue:     wraps the output value in nc format and replaces
+   *                    output_.value with it.
+   * resetFwdPD:        builds the forward primitive descriptor, with or
+   *                    without bias.
+   * resetFwdPipeline:  clears the pipeline and adds the forward primitive.
+   */
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetInValue(MKLDNNMatrixPtr& in);
+  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
+  void resetOutValue(MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * Backward functions: reset buffers(input, output, weight and bias),
+   *                     reset primitive descriptor for backward weight,
+   *                     reset primitive descriptor for backward data,
+   *                     reset pipeline.
+   *
+   * resetBwdBuffers:  calls resetOutGrad, resetWgtBiasGrad and resetInGrad
+   *                   in that order.
+   * resetOutGrad:     wraps the output grad with the primitive desc of
+   *                   outVal_.
+   * resetWgtBiasGrad: wraps the weight grad with the primitive desc of
+   *                   wgtVal_; bias is nullptr when biasVal_ is null.
+   * resetInGrad:      in is nullptr when the input layer has no grad,
+   *                   otherwise it uses the primitive desc of inVal_.
+   * resetBwdWgtPD:    builds the backward-weight primitive descriptor from
+   *                   fwdPD_, with or without bias.
+   * resetBwdDataPD:   pd is nullptr when in is null, otherwise it is built
+   *                   from fwdPD_.
+   * resetBwdPipeline: clears the pipeline, adds backward weight and, when
+   *                   bwdDataPD is set, backward data.
+   */
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetOutGrad(MKLDNNMatrixPtr& out);
+  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
+  void resetInGrad(MKLDNNMatrixPtr& in);
+  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& wgt,
+                     MKLDNNMatrixPtr& bias,
+                     MKLDNNMatrixPtr& out);
+  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+                      MKLDNNMatrixPtr& in,
+                      MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "MKLDNNTester.h"
 #include "ModelConfig.pb.h"
+#include "paddle/math/MathUtils.h"
 using namespace paddle;  // NOLINT
@@ -63,6 +64,83 @@ TEST(MKLDNNLayer, FcLayer) {
  testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16});
 }
+struct testConvDesc {  // one conv test case; field order is used positionally
+  int bs, gp;          // batch size, group number
+  int ic, ih, iw;      // input channels, height, width
+  int oc, oh, ow;      // output channels, height, width
+  int fh, fw;          // filter height, width
+  int ph, pw;          // padding height, width
+  int sh, sw;          // stride height, width
+  int dh, dw;          // dilation height, width
+};
+// Run one conv configuration through MKLDNNTester, comparing "mkldnn_conv"
+// against "exconv", with and without bias, for batch size pm.bs and 1.
+void testConvLayer(const testConvDesc& pm) {
+  const std::string compareTypes[] = {"mkldnn_conv", "exconv"};
+  TestConfig cfg;
+  auto& layer = cfg.layerConfig;
+  layer.set_type(compareTypes[0]);
+  layer.set_num_filters(pm.oc);
+  layer.set_size(pm.oc * pm.oh * pm.ow);
+  // cfg.layerConfig.set_partial_sum(1); // TODO: check it
+  layer.set_shared_biases(true);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ static_cast<size_t>(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */
+       static_cast<size_t>(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
+  // fill in the conv settings of the single input
+  ConvConfig* conv = layer.add_inputs()->mutable_conv_conf();
+  conv->set_groups(pm.gp);
+  conv->set_img_size(pm.iw);
+  conv->set_img_size_y(pm.ih);
+  conv->set_output_x(pm.ow);
+  conv->set_output_y(pm.oh);
+  conv->set_filter_size(pm.fw);
+  conv->set_filter_size_y(pm.fh);
+  conv->set_channels(pm.ic);
+  conv->set_padding(pm.pw);
+  conv->set_padding_y(pm.ph);
+  conv->set_stride(pm.sw);
+  conv->set_stride_y(pm.sh);
+  conv->set_dilation(pm.dw);
+  conv->set_dilation_y(pm.dh);
+  conv->set_caffe_mode(true);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
+      << "it is indivisible";
+  // the expected output size must agree with the dilated filter size
+  const int dilatedFh = (pm.fh - 1) * pm.dh + 1;
+  const int dilatedFw = (pm.fw - 1) * pm.dw + 1;
+  const int expectOw = outputSize(pm.iw, dilatedFw, pm.pw, pm.sw, true);
+  const int expectOh = outputSize(pm.ih, dilatedFh, pm.ph, pm.sh, true);
+  CHECK_EQ(expectOw, pm.ow) << "output size check failed";
+  CHECK_EQ(expectOh, pm.oh) << "output size check failed";
+  MKLDNNTester tester;
+  for (auto biasSize : {pm.oc, 0}) {
+    cfg.biasSize = biasSize;
+    // the reference config only differs in the layer type
+    TestConfig ref = cfg;
+    ref.layerConfig.set_type(compareTypes[1]);
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+// Run testConvLayer over a fixed table of conv configurations.
+TEST(MKLDNNLayer, ConvLayer) {
+  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
+  const testConvDesc cases[] = {
+      {2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1},
+      {2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1},
+      {3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1},
+      {8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1},
+      {16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1},
+      {2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1},
+      {3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1},
+      // with groups
+      {2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1},
+      {2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1},
+      {4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1},
+  };
+  for (const testConvDesc& pm : cases) {
+    testConvLayer(pm);
+  }
+}
 // TODO(TJ): add branch test
 int main(int argc, char** argv) {

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -49,6 +49,27 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
 }
+// Creates the reorder primitive that converts src into dst.
+// Returns nullptr when no conversion is needed, i.e. src and dst are the same
+// object or already have equal primitive descriptors.
+// With checkData set, src and dst sharing one data handle is a fatal error.
+std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
+                                                     const MKLDNNMatrixPtr& dst,
+                                                     bool checkData) {
+  if (src == dst || src->getPrimitiveDesc() == dst->getPrimitiveDesc()) {
+    return nullptr;
+  }
+  if (checkData && (src->getData() == dst->getData())) {
+    LOG(FATAL) << "can not create reorder with inplace data";
+    return nullptr;
+  }
+  // src and dst must have the same rank and the same size in every dimension
+  memory::dims srcDims = src->getDims();
+  memory::dims dstDims = dst->getDims();
+  CHECK_EQ(srcDims.size(), dstDims.size());
+  for (size_t i = 0; i < srcDims.size(); ++i) {
+    CHECK_EQ(srcDims[i], dstDims[i]);
+  }
+  return std::make_shared<reorder>(*src, *dst);
+}
 void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
                                   memory::format srcFmt,
                                   memory::dims targetDim) {

--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -52,6 +52,32 @@ public:
      mkldnn::engine& eg,
      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+  /**
+   * Create a memory descriptor from dims, format and data type.
+   * fmt defaults to memory::format::any and dtype to memory::data_type::f32.
+   * Note that the parameters are (dims, fmt, dtype) while the underlying
+   * mkldnn::memory::desc is constructed with (dims, dtype, fmt).
+   */
+  static mkldnn::memory::desc createMemoryDesc(
+      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::desc(dims, dtype, fmt);
+  }
+  /**
+   * Create a reorder primitive.
+   * Creates a mkldnn::reorder handle that converts the src MKLDNNMatrix to
+   * dst.
+   * checkData: whether to check the data handles of src and dst.
+   *            If true, the data handles are compared and must not be equal.
+   *            If false, they are not compared, so the created reorder may
+   *            operate on an in-place buffer.
+   *            Do not pass false unless you can guarantee that in-place
+   *            operation works with your reorder.
+   */
+  static std::shared_ptr<mkldnn::reorder> createReorder(
+      const MKLDNNMatrixPtr& src,
+      const MKLDNNMatrixPtr& dst,
+      bool checkData = true);
 public:
  /**
   * Reorder this MKLDNNMatrix from other format.

--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -62,6 +62,24 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
  }
 }
+// Copy specialization for a GPU source and a CPU destination.
+// Selects src_place.device, then copies num (presumably a byte count -- TODO
+// confirm) from src to dst through GpuMemcpySync with
+// cudaMemcpyDeviceToHost. dst_place is not used.
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+}
+// Copy specialization for a CPU source and a GPU destination.
+// Selects dst_place.device, then copies num (presumably a byte count -- TODO
+// confirm) from src to dst through GpuMemcpySync with
+// cudaMemcpyHostToDevice. src_place is not used.
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+}
 #endif  // PADDLE_ONLY_CPU
 }  // namespace memory

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+# pybind.h is regenerated on every configure: start it with a banner here,
+# then op_library() appends one USE_*_OP line per operator.
+set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
+file(WRITE ${pybind_file} "// Generated by the paddle/operators/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
    # op_library is a function to create op library. The interface is same as
    # cc_library. But it handle split GPU/CPU code and link some common library
@@ -7,10 +9,11 @@ function(op_library TARGET)
    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
    set(cc_srcs)
    set(cu_srcs)
-    set(op_common_deps operator op_registry)
+    set(op_common_deps operator op_registry math_function)
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
+    set(pybind_flag 0)
    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
            "${multiValueArgs}" ${ARGN})
@@ -46,22 +49,42 @@ function(op_library TARGET)
        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
                ${op_common_deps})
    endif()
+    # net_op doesn't need pybind
+    if ("${TARGET}" STREQUAL "net_op")
+        set(pybind_flag 1)
+    endif()
+    # pybind USE_NO_KERNEL_OP
+    file(READ ${TARGET}.cc TARGET_CONTENT)
+    string(REGEX MATCH "OperatorWithKernel" regex_result "${TARGET_CONTENT}")
+    string(REPLACE "_op" "" TARGET "${TARGET}")
+    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+    # pybind USE_CPU_ONLY_OP
+    list(LENGTH cu_srcs cu_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0)
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+    # pybind USE_OP
+    if (${pybind_flag} EQUAL 0)
+        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+    endif()
 endfunction()
 add_subdirectory(math)
 set(DEPS_OPS
-    identity_op
-    minus_op
-    mul_op
    recurrent_op
-    scale_op)
+    cond_op)
-op_library(identity_op DEPS scale_op)
-op_library(minus_op DEPS scale_op)
-op_library(mul_op DEPS math_function)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
-  DEPS framework_proto tensor operator net_op)
+  DEPS framework_proto tensor net_op)
-op_library(scale_op DEPS net_op)
+op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -23,10 +23,15 @@ class AccuracyOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Inference"),
+    PADDLE_ENFORCE_NOT_NULL(
-                            "Input of Inference must be initialized.");
+        ctx.InputVar("Inference"),
+        "Input(Inference) of AccuracyOp should not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input of Inference must be initialized.");
+                            "Input(Label) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Accuracy"),
+        "Output(Accuracy) of AccuracyOp should not be null.");
    auto *inference = ctx.Input<framework::Tensor>("Inference");
    auto *label = ctx.Input<framework::Tensor>("Label");
@@ -34,7 +39,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0],
                      "inference size must be the same as label size");
-    ctx.Output<Tensor>("Accuracy")->Resize({1});
+    ctx.Output<framework::LoDTensor>("Accuracy")->Resize({1});
  }
 };

--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -23,10 +23,18 @@ class AddOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of AddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of AddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of AddOp should not be null.");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                      ctx.Input<Tensor>("Y")->dims(),
                      "Two input of Add Op's dimension must be same.");
-    ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx.Output<framework::LoDTensor>("Out")->Resize(
+        ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -25,8 +25,11 @@ class ConcatOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ConcatOp should not be null.");
    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
    size_t n = ins.size();

--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/cond_op.h"
+#include <cstring>
+#include <sstream>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/gather.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/scatter.h"
+namespace paddle {
+namespace operators {
+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+// Creates a new scope through scope.NewScope() and appends a pointer to it to
+// the scope list held by the variable named "SubScopes".
+// Fails if "SubScopes" cannot be found from `scope`.
+void CondOp::CreateScope(const Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
+  auto& sub_scope = scope.NewScope();
+  sub_scopes->push_back(&sub_scope);
+}
+// Appends one default-constructed LoDTensor to the tensor list held by the
+// variable named "IndexTensors".
+// Fails if "IndexTensors" cannot be found from `scope`.
+void CondOp::CreateIndexTensor(const Scope& scope) const {
+  auto index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
+  auto& index_tensors =
+      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
+  index_tensors.push_back(LoDTensor());
+}
+// Sets up the per-branch state and infers the shape of every variable in Outs.
+//
+// For each branch i (0: true, 1: false) this appends a sub scope and an empty
+// index tensor, creates the Xs inputs and the branch net's outputs inside the
+// sub scope, and runs the branch net's InferShape. Afterwards each Outs
+// variable is resized to the true-branch output shape, which is required to
+// equal the false-branch shape, and its float buffer is requested on the CPU.
+void CondOp::InferShape(const Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
+  // Does not depend on the branch, so check it once before creating any state.
+  PADDLE_ENFORCE(!Inputs("Xs").empty(),
+                 "Inputs(Xs) of CondOp can't be empty.");
+  for (int i = 0; i < 2; ++i) {
+    // Create two sub scopes for true and false branches
+    // sub_scopes[0] for the true branch and sub_scopes[1] for the false
+    // branch
+    CreateScope(scope);
+    // Create two tensors for true and false indices
+    // index_tensors[0] for the true branch and index_tensors[1] for the false
+    // branch
+    CreateIndexTensor(scope);
+    for (auto& input : Inputs("Xs")) {
+      // The parent variable supplies the shape, so it has to exist.
+      Variable* parent_var = scope.FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(parent_var,
+                              "Input(Xs) of CondOp should not be null.");
+      // Create a new tensor in sub-scope for input-type tensor
+      Variable* v = sub_scopes[i]->NewVar(input);
+      LoDTensor* sub_input = v->GetMutable<LoDTensor>();
+      sub_input->Resize(parent_var->GetMutable<LoDTensor>()->dims());
+    }
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->NewVar(var_name);
+      }
+    }
+    // each net calls InferShape
+    sub_net_op_[i]->InferShape(*sub_scopes[i]);
+  }
+  for (auto& output : Outputs("Outs")) {
+    // Null-check each variable before calling GetMutable on it.
+    auto* t_out_var = sub_scopes[0]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(t_out_var, "True output should not be NULL");
+    LoDTensor* tensor_t_out = t_out_var->GetMutable<LoDTensor>();
+    auto* f_out_var = sub_scopes[1]->FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(f_out_var, "False output should not be NULL");
+    LoDTensor* tensor_f_out = f_out_var->GetMutable<LoDTensor>();
+    auto* tensor_out_var = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
+    LoDTensor* tensor_out = tensor_out_var->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_out, "Output tensor should not be NULL");
+    // check output size should be same
+    PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
+                      "Outputs not of the same shape");
+    tensor_out->Resize(tensor_t_out->dims());
+    tensor_out->mutable_data<float>(platform::CPUPlace());
+  }
+}
+// Runs both branches on their share of the rows and merges the results.
+//
+//   1. Splits the row indices of Cond into a true list (non-zero entries) and
+//      a false list (zero entries) and copies them into the two index tensors.
+//   2. Gathers the selected rows of every Xs input into the matching sub scope.
+//   3. Runs the true net and the false net on their sub scopes.
+//   4. Scatters each branch's Outs rows back into the parent-scope outputs.
+//
+// InferShape must have been called first: it creates the sub scopes and the
+// index tensors that are used here.
+void CondOp::Run(const Scope& scope,
+                 const platform::DeviceContext& dev_ctx) const {
+  auto* sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  // Only the scope pointers are read, so bind by reference instead of copying.
+  const auto& sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
+  auto* index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
+  // Work on the tensors stored in the scope rather than on a copy, so that the
+  // indices written below end up in the IndexTensors output variable.
+  auto& index_tensors =
+      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
+  PADDLE_ENFORCE(sub_scopes.size() >= 2 && index_tensors.size() >= 2,
+                 "InferShape of CondOp must be called before Run.");
+  std::string cond_name = Input("Cond");
+  Variable* cond_var = scope.FindVar(cond_name);
+  PADDLE_ENFORCE_NOT_NULL(cond_var,
+                          "Input(Cond) of CondOp should not be null.");
+  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
+  // Step 1: get the true/false index at runtime
+  // index_[0]: vector<int>, contains all index for cond[i] == true
+  // index_[1]: vector<int>, contains all index for cond[i] == false
+  for (int i = 0; i < 2; ++i) index_[i].clear();
+  const int* cond_data = cond->data<int>();
+  for (int i = 0; i < cond->dims()[0]; ++i) {
+    if (cond_data[i])
+      index_[0].push_back(i);
+    else
+      index_[1].push_back(i);
+  }
+  // put index_[0] and index_[1] into two tensors:
+  // index_tensors[0] and index_tensors[1]
+  DDim dim = paddle::framework::make_ddim({0});
+  for (int i = 0; i < 2; ++i) {
+    dim[0] = index_[i].size();
+    int* tmp_ptr =
+        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
+    index_tensors[i].Resize(dim);
+    memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
+  }
+  // Step 2: collect data by calling gather
+  for (int i = 0; i < 2; ++i) {
+    // i = 0/1 for True and False branches respectively
+    for (auto& input : Inputs("Xs")) {
+      // find Tensor
+      Variable* v = scope.FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
+      v = sub_scopes[i]->FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+      // Resize child: keep the trailing dimensions, one row per selected index
+      DDim child_dim = tensor_child->dims();
+      child_dim[0] = index_[i].size();
+      tensor_child->Resize(child_dim);
+      tensor_child->mutable_data<float>(child_dim, platform::CPUPlace());
+      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                    tensor_child);
+    }
+  }
+  // Step 3: run
+  for (int i = 0; i < 2; ++i) {
+    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  }
+  // Step 4: merge output results
+  PADDLE_ENFORCE(!Outputs("Outs").empty(),
+                 "Outputs(Outs) of CondOp can't be empty.");
+  for (int i = 0; i < 2; ++i) {
+    // i = 0/1 for True and False branches respectively
+    for (auto& output : Outputs("Outs")) {
+      // find Tensor
+      Variable* v = scope.FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
+      v = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(v);
+      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+                           tensor_parent);
+    }
+  }
+}
+// Declares the proto of the cond operator: the Cond input, the duplicable Xs
+// inputs and Outs outputs, and the two bookkeeping outputs SubScopes and
+// IndexTensors, which CondOp looks up by name.
+class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Cond", "The condition, which is a bool vector");
+    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
+    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
+    AddOutput("SubScopes", "sub scopes for true and false branches");
+    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
+    // The false case takes its rows from the false subnet (subnet_f).
+    AddComment(R"DOC(
+Sample dependent Cond Operator:
+Given Cond[i] as a 1/0 vector to indicate true/false
+The equation is: 
+Out[i] = subnet_t[i], if Cond[i] == true
+Out[i] = subnet_f[i], if Cond[i] == false
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
+                             paddle::operators::CondOpProtoAndCheckerMaker);
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+/*
+ * @brief CondOp is a dynamic if-else Operator
+ *
+ * It has an input tensor named cond that indicates which net op each instance
+ * will run.
+ *
+ * if cond == 1, it will run true_net, which is a NetOp.
+ *
+ * if cond == 0, it will run false_net, which is another NetOp.
+ */
+class CondOp : public framework::OperatorBase {
+ public:
+  // Reserves two slots each for the index lists and the sub nets; the nets
+  // themselves are supplied later through set_truenet / set_falsenet.
+  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    index_.resize(2);
+    sub_net_op_.resize(2);
+  }
+  // Copying is not supported yet: this constructor always throws.
+  CondOp(const CondOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    // TODO(yuyang18): Implement copy ctor well.
+    PADDLE_THROW("Not implemented");
+  }
+  // Appends a new sub scope to the "SubScopes" variable.
+  void CreateScope(const framework::Scope& scope) const;
+  // Appends a new, empty index tensor to the "IndexTensors" variable.
+  void CreateIndexTensor(const framework::Scope& scope) const;
+  /*
+   * InferShape must be called before Run.
+   */
+  void InferShape(const framework::Scope& scope) const override;
+  /*
+   * Set True Block. Takes ownership of net and stores it in sub_net_op_[0].
+   */
+  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
+    sub_net_op_[0] = std::move(net);
+  }
+  /*
+   * Set False Block. Takes ownership of net and stores it in sub_net_op_[1].
+   */
+  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
+    sub_net_op_[1] = std::move(net);
+  }
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override;
+ private:
+  // sub_net_op_[0]: subnet_t
+  // sub_net_op_[1]: subnet_f
+  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
+  // index_[0]: True_index;
+  // index_[1]: False_index;
+  // mutable because the const Run() rebuilds both lists on every call.
+  mutable std::vector<std::vector<int>> index_;
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -25,16 +25,38 @@ class CosSimOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    // notnull check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                            "Input(X) of CosSimOp should not be null.");
-                      ctx.Input<Tensor>("Y")->dims(),
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                      "Dimensions of Input(X) and Input(Y) must be the same.");
+                            "Input(Y) of CosSimOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-    auto dims = ctx.Input<Tensor>("X")->dims();
+                            "Output(Out) of CosSimOp should not be null.");
-    ctx.Output<Tensor>("Out")->Resize({dims[0], 1});
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("XNorm"),
-    ctx.Output<Tensor>("XNorm")->Resize({dims[0], 1});
+                            "Output(XNorm) of CosSimOp should not be null.");
-    ctx.Output<Tensor>("YNorm")->Resize({dims[0], 1});
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("YNorm"),
+                            "Output(YNorm) of CosSimOp should not be null.");
+    // shape check
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                      "Ranks of Input(X) and Input(Y) must be equal.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) must not be less than 2.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
+                   " just 1 (which will be broadcasted to match Input(X)).");
+    // resize tensor
+    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::LoDTensor>("XNorm")->Resize({x_dims[0], 1});
+    ctx.Output<framework::LoDTensor>("YNorm")->Resize({y_dims[0], 1});
  }
 };
@@ -42,16 +64,27 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of cos_sim op.");
+    AddInput("X", "The 1st input of cos_sim op.");
-    AddInput("Y", "The second input of cos_sim op.");
+    AddInput("Y", "The 2nd input of cos_sim op.");
    AddOutput("Out", "The output of cos_sim op.");
-    AddOutput("XNorm", "Row norm of the first input.").AsIntermediate();
+    AddOutput("XNorm",
-    AddOutput("YNorm", "Row norm of the second input.").AsIntermediate();
+              "Norm of the first input, reduced along the 1st "
+              "dimension.")
+        .AsIntermediate();
+    AddOutput("YNorm",
+              "Norm of the second input, reduced along the 1st "
+              "dimension.")
+        .AsIntermediate();
    AddComment(R"DOC(
 Cosine Similarity Operator.
-The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y))
+The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)).
+Input(X) and Input(Y) must have the same shape, except that the 1st dimension
+of Input(Y) could be just 1 (different from Input(X)), which will be
+broadcasted to match the shape of Input(X) before computing their cosine
+similarity.
 )DOC");
  }
 };
@@ -62,34 +95,54 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    // Validates the inputs of the backward op, then resizes X@GRAD and Y@GRAD
+    // (when present) to the shapes of X and Y.
+    // notnull check
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"),
                            "Input(XNorm) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"),
                            "Input(YNorm) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Out"),
+                            "Input(Out) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) must not be null.");
+    // shape check
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    auto xnorm_dims = ctx.Input<Tensor>("XNorm")->dims();
    auto ynorm_dims = ctx.Input<Tensor>("YNorm")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto out_dims = ctx.Input<Tensor>("Out")->dims();
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
+    auto out_grad_dims =
-                      "Dimensions of Input(X) and Input(Y) must be the same.");
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0],
-                      "1st dimension of XNorm must equal that of Input(X).");
+    // ranks must match exactly, as in the forward op's shape check
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
-    PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one.");
+                      "Ranks of Input(X) and Input(Y) must be equal.");
-    PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0],
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "1st dimension of YNorm must equal that of Input(Y).");
+                      "Rank of Input(X) must not be less than 2.");
-    PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
-    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
-                      "1st dimension of Out@GRAD must equal that of Input(X)");
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
-    PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one.");
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
-    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+                   " just 1 (which will be broadcasted to match Input(X)).");
+    auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1});
+    auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1});
+    PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims,
+                      "Shape of Input(XNorm) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims,
+                      "Shape of Input(YNorm) must be [Y.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims,
+                      "Shape of Input(Out) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims,
+                      "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
+    // resize tensor
+    auto *x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *y_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    if (x_grad) x_grad->Resize(x_dims);
    if (y_grad) y_grad->Resize(y_dims);
  }

--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -31,30 +31,38 @@ template <typename Place, typename T>
 class CosSimKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_x = context.Input<Tensor>("X");
+    // get Tensor
-    auto* input_y = context.Input<Tensor>("Y");
+    auto* in_x = context.Input<Tensor>("X");
-    auto* output_z = context.Output<Tensor>("Out");
+    auto* in_y = context.Input<Tensor>("Y");
-    auto* output_x_norm = context.Output<Tensor>("XNorm");
+    auto* out_z = context.Output<Tensor>("Out");
-    auto* output_y_norm = context.Output<Tensor>("YNorm");
+    auto* out_x_norm = context.Output<Tensor>("XNorm");
+    auto* out_y_norm = context.Output<Tensor>("YNorm");
+    out_z->mutable_data<T>(context.GetPlace());
+    out_x_norm->mutable_data<T>(context.GetPlace());
+    out_y_norm->mutable_data<T>(context.GetPlace());
-    output_z->mutable_data<T>(context.GetPlace());
+    // convert Tensor to Eigen Tensor
-    output_x_norm->mutable_data<T>(context.GetPlace());
+    int rows_x = in_x->dims()[0];
-    output_y_norm->mutable_data<T>(context.GetPlace());
+    int rows_y = in_y->dims()[0];
+    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto dims = input_x->dims();
+    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    int64_t size = input_x->numel();
+    auto z = EigenVector<T>::Flatten(*out_z);
-    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
+    auto x_norm = EigenVector<T>::Flatten(*out_x_norm);
-    auto x = EigenMatrix<T>::From(*input_x, new_dims);
+    auto y_norm = EigenVector<T>::Flatten(*out_y_norm);
-    auto y = EigenMatrix<T>::From(*input_y, new_dims);
-    auto z = EigenVector<T>::Flatten(*output_z);
-    auto x_norm = EigenVector<T>::Flatten(*output_x_norm);
-    auto y_norm = EigenVector<T>::Flatten(*output_y_norm);
+    // compute
    auto place = context.GetEigenDevice<Place>();
-    auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
+    auto row_along = Eigen::array<int, 1>({{1}});
-    x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
+    x_norm.device(place) = x.square().sum(row_along).sqrt();
-    y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
+    y_norm.device(place) = y.square().sum(row_along).sqrt();
-    z.device(place) = xy / x_norm / y_norm;
+    if (rows_x == rows_y) {
+      auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
+      z.device(place) = xy / x_norm / y_norm;
+    } else {
+      Eigen::DSizes<int, 2> bcast(rows_x, 1);
+      auto xy = (x * y.broadcast(bcast)).sum(row_along);
+      z.device(place) = xy / x_norm / y_norm.broadcast(bcast);
+    }
  }
 };
@@ -62,43 +70,72 @@ template <typename Place, typename T>
 class CosSimGradKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_x = context.Input<Tensor>("X");
+    // get Tensor
-    auto* input_y = context.Input<Tensor>("Y");
+    auto* in_x = context.Input<Tensor>("X");
-    auto* input_z = context.Input<Tensor>("Out");
+    auto* in_y = context.Input<Tensor>("Y");
-    auto* input_x_norm = context.Input<Tensor>("XNorm");
+    auto* in_z = context.Input<Tensor>("Out");
-    auto* input_y_norm = context.Input<Tensor>("YNorm");
+    auto* in_x_norm = context.Input<Tensor>("XNorm");
-    auto* output_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* in_y_norm = context.Input<Tensor>("YNorm");
-    auto* output_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto* out_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto dims = input_x->dims();
+    // convert Tensor to Eigen Tensor
-    int64_t size = input_x->numel();
+    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
-    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
+    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
-    auto x = EigenMatrix<T>::From(*input_x, new_dims);
+    auto z = EigenMatrix<T>::Reshape(*in_z, 1);
-    auto y = EigenMatrix<T>::From(*input_y, new_dims);
+    auto x_norm = EigenMatrix<T>::Reshape(*in_x_norm, 1);
-    auto z = EigenMatrix<T>::From(*input_z);
+    auto y_norm = EigenMatrix<T>::Reshape(*in_y_norm, 1);
-    auto x_norm = EigenMatrix<T>::From(*input_x_norm);
+    auto dz = EigenMatrix<T>::Reshape(*in_grad_z, 1);
-    auto y_norm = EigenMatrix<T>::From(*input_y_norm);
-    auto dz = EigenMatrix<T>::From(*input_grad_z);
-    Eigen::DSizes<int, 2> bcast(1, new_dims[1]);
+    // compute gradient
-    auto z_bcast = z.broadcast(bcast);
+    int rows_x = in_x->dims()[0];
-    auto dz_bcast = dz.broadcast(bcast);
+    int rows_y = in_y->dims()[0];
+    int cols = framework::product(in_x->dims()) / rows_x;
+    Eigen::DSizes<int, 2> bcast_cols(1, cols);
+    auto z_bcast = z.broadcast(bcast_cols);
+    auto dz_bcast = dz.broadcast(bcast_cols);
+    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
    auto place = context.GetEigenDevice<Place>();
-    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast);
+    if (rows_x == rows_y) {
-    auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast);
+      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
-    auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast);
+      auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
-    if (output_grad_x) {
+      // compute dx
-      output_grad_x->mutable_data<T>(context.GetPlace());
+      if (out_grad_x) {
-      auto dx = EigenMatrix<T>::From(*output_grad_x, new_dims);
+        out_grad_x->mutable_data<T>(context.GetPlace());
-      dx.device(place) =
+        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
-          dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast);
+        auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
-    }
+        dx.device(place) = dz_bcast * grad;
-    if (output_grad_y) {
+      }
-      output_grad_y->mutable_data<T>(context.GetPlace());
+      // compute dy
-      auto dy = EigenMatrix<T>::From(*output_grad_y, new_dims);
+      if (out_grad_y) {
-      dy.device(place) =
+        out_grad_y->mutable_data<T>(context.GetPlace());
-          dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast);
+        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast;
+        dy.device(place) = dz_bcast * grad;
+      }
+    } else {
+      Eigen::DSizes<int, 2> bcast_rows(rows_x, 1);
+      Eigen::DSizes<int, 2> bcast_rows_cols(rows_x, cols);
+      auto y_bcast = y.broadcast(bcast_rows);
+      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols);
+      auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows))
+                                 .eval()
+                                 .broadcast(bcast_cols);
+      // compute dx
+      if (out_grad_x) {
+        out_grad_x->mutable_data<T>(context.GetPlace());
+        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
+        auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
+        dx.device(place) = dz_bcast * grad;
+      }
+      // compute dy
+      if (out_grad_y) {
+        out_grad_y->mutable_data<T>(context.GetPlace());
+        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
+        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
+      }
    }
  }
 };

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -141,17 +141,23 @@ template <typename T>
 class CropCPUKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
+    LOG(INFO) << "CropCPUKernel step1";
    auto *x = context.Input<Tensor>("X");
+    LOG(INFO) << "CropCPUKernel step2";
    auto *out = context.Output<Tensor>("Out");
+    LOG(INFO) << "CropCPUKernel step3";
    auto x_data = x->data<T>();
    T *out_data = out->mutable_data<T>(paddle::platform::CPUPlace());
+    LOG(INFO) << "CropCPUKernel step4";
    auto x_dims = x->dims();
    auto out_dims = out->dims();
+    LOG(INFO) << "CropCPUKernel step5";
    int64_t out_count = framework::product(out_dims);
    std::vector<int64_t> x_shape = framework::vectorize(x_dims);
    std::vector<int64_t> out_shape = framework::vectorize(out_dims);
    auto offsets = context.op().Attr<std::vector<int>>("offsets");
+    LOG(INFO) << "CropCPUKernel step6";
    PADDLE_ENFORCE_EQ(
        x_dims.size(), offsets.size(),
        "Offsets size should be equal to dimension size of input tensor.");
@@ -165,6 +171,7 @@ class CropCPUKernel : public framework::OpKernel {
    for (int64_t i = 0; i < out_count; ++i) {
      out_data[i] = x_data[transIndex(out_shape, x_shape, crop_rules, i)];
    }
+    LOG(INFO) << "CropCPUKernel step7";
  }
 };

--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
@@ -48,6 +48,7 @@ template <typename T, int D>
 void CropCUDAFunctoin(const framework::ExecutionContext& context) {
  PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                 "It must use GPUPlace.");
+  LOG(INFO) << "CropCUDAFunctoin step1";
  auto* x = context.Input<Tensor>("X");
  auto* out = context.Output<Tensor>("Out");
  auto x_data = x->data<T>();

--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -25,13 +25,19 @@ class ElementWiseMulOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+                            "Input(X) of ElementWiseMulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of ElementWiseMulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of ElementWiseMulOp should not be null.");
    auto x_dim = ctx.Input<Tensor>("X")->dims();
    auto y_dim = ctx.Input<Tensor>("Y")->dims();
    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
                      "Rank of first input must >= rank of second input.")
-    ctx.Output<Tensor>("Out")->Resize(x_dim);
+    ctx.Output<framework::LoDTensor>("Out")->Resize(x_dim);
  }
 };
@@ -80,8 +86,10 @@ class ElementWiseMulOpGrad : public framework::OperatorWithKernel {
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *x_grad =
-    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *y_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                      "Rank of first input must >= rank of second input.")

--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -13,10 +13,8 @@
   limitations under the License. */
 #pragma once
-#include <iostream>
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace operators {

--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -23,7 +23,14 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::Tensor>("Dst")->Resize(
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("Src"),
+        "Input(Src) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Dst"),
+        "Output(Dst) of FillZerosLikeOp should not be null.");
+    ctx.Output<framework::LoDTensor>("Dst")->Resize(
        ctx.Input<framework::Tensor>("Src")->dims());
  }
 };

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -24,11 +24,18 @@ class GatherOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of GatherOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
+                            "Input(Index) of GatherOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of GatherOp should not be null.");
    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
    output_dims[0] = batch_size;
-    ctx.Output<Tensor>("Out")->Resize(output_dims);
+    ctx.Output<framework::LoDTensor>("Out")->Resize(output_dims);
  }
 };
@@ -38,7 +45,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X_grad = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");
    X_grad->Resize(X->dims());

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -43,8 +43,12 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
-  void InferShape(const framework::InferShapeContext& context) const override {
+  void InferShape(const framework::InferShapeContext& ctx) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of GaussianRandomOp should not be null.");
+    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
    auto dims = Attr<std::vector<int>>("dims");
    std::vector<int64_t> temp;
    temp.reserve(dims.size());

--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
@@ -42,6 +42,11 @@ class IdentityOp : public NetOp {
             const framework::VariableNameMap &outputs,
             const framework::AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
+    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
+                      "Input(X) of IdentityOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+                      "Output(Out) of IdentityOp should not be null.");
    AppendOp(framework::OpRegistry::CreateOp(
        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
        {{"scale", static_cast<AttrType>(1)}}));

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -22,10 +22,17 @@ class LookupTableOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
-  void InferShape(const framework::InferShapeContext &context) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto table_t = context.Input<Tensor>("W");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("W"),
-    auto ids_t = context.Input<Tensor>("Ids");
+                            "Input(W) of LookupTableOp should not be null.");
-    auto output_t = context.Output<Tensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ids"),
+                            "Input(Ids) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of LookupTableOp should not be null.");
+    auto table_t = ctx.Input<Tensor>("W");
+    auto ids_t = ctx.Input<Tensor>("Ids");
+    auto output_t = ctx.Output<framework::LoDTensor>("Out");
    output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
  }
@@ -56,7 +63,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &context) const override {
    auto table = context.Input<Tensor>("W");
-    auto d_table = context.Output<Tensor>(framework::GradVarName("W"));
+    auto d_table =
+        context.Output<framework::LoDTensor>(framework::GradVarName("W"));
    d_table->Resize(table->dims());
  }
 };

--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -24,8 +24,10 @@ class MeanOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of MeanOp must be initialized.");
+                            "Input(X) of MeanOp should not be null.");
-    ctx.Output<Tensor>("Out")->Resize({1});
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of MeanOp should not be null.");
+    ctx.Output<framework::LoDTensor>("Out")->Resize({1});
  }
 };
@@ -45,7 +47,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>(framework::GradVarName("X"))
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -27,13 +27,20 @@ class MinusOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of MinusOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of MinusOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of MinusOp should not be null.");
    auto *left_tensor = ctx.Input<framework::Tensor>("X");
    auto *right_tensor = ctx.Input<framework::Tensor>("Y");
    PADDLE_ENFORCE_EQ(
        left_tensor->numel(), right_tensor->numel(),
        "Minus operator must take two tensor with same num of elements");
-    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
+    ctx.Output<framework::LoDTensor>("Out")->Resize(left_tensor->dims());
  }
 };
@@ -77,8 +84,6 @@ class MinusGradOp : public NetOp {
 }  // namespace operators
 }  // namespace paddle
-USE_OP(scale);
-USE_NO_KERNEL_OP(identity);
 namespace ops = paddle::operators;
 REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
            ops::MinusGradOp<float>);

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
+using framework::LoDTensor;
 class MulOp : public framework::OperatorWithKernel {
 public:
@@ -25,6 +26,13 @@ class MulOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of MulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+                            "Input(Y) of MulOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of MulOp should not be null.");
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    int x_num_col_dims = Attr<int>("x_num_col_dims");
@@ -45,7 +53,8 @@ class MulOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(
        x_mat_dims[1], y_mat_dims[0],
        "First matrix's width must be equal with second matrix's height.");
-    ctx.Output<Tensor>("Out")->Resize({x_mat_dims[0], y_mat_dims[1]});
+    ctx.Output<framework::LoDTensor>("Out")->Resize(
+        {x_mat_dims[0], y_mat_dims[1]});
  }
 };
@@ -94,8 +103,10 @@ class MulOpGrad : public framework::OperatorWithKernel {
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *x_grad =
-    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *y_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    auto x_mat_dims =
        framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/operators/onehot_cross_entropy_op.h"
 namespace paddle {
 namespace operators {
@@ -23,13 +23,23 @@ class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        "Input(X) of OnehotCrossEntropyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("label"),
+        "Input(label) of OnehotCrossEntropyOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Y"),
+        "Output(Y) of OnehotCrossEntropyOp should not be null.");
    auto *X = ctx.Input<Tensor>("X");
    auto *label = ctx.Input<Tensor>("label");
    PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2.");
    PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1.");
    PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]);
-    ctx.Output<Tensor>("Y")->Resize({X->dims()[0]});
+    ctx.Output<framework::LoDTensor>("Y")->Resize({X->dims()[0], 1});
  }
 };
@@ -39,7 +49,7 @@ class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dX = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");
    dX->Resize(X->dims());

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -25,6 +25,11 @@ class PadOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of PadOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of PadOp should not be null.");
    auto x_dim = ctx.Input<Tensor>("X")->dims();
    auto paddings = Attr<std::vector<int>>("paddings");
    PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
@@ -34,7 +39,8 @@ class PadOp : public framework::OperatorWithKernel {
    for (int i = 0; i < x_dim.size(); ++i) {
      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
    }
-    ctx.Output<Tensor>("Out")->Resize(framework::make_ddim(out_dims));
+    ctx.Output<framework::LoDTensor>("Out")->Resize(
+        framework::make_ddim(out_dims));
  }
 };
@@ -95,9 +101,9 @@ class PadOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) should not be null");
    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *x_g = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    if (x_grad != nullptr) {
+    if (x_g != nullptr) {
-      x_grad->Resize(x_dims);
+      x_g->Resize(x_dims);
    }
  }
 };

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -26,10 +26,11 @@ namespace operators {
 using Scope = framework::Scope;
 using Variable = framework::Variable;
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
-                 ->GetMutable<Tensor>()
+                 ->GetMutable<LoDTensor>()
                 ->dims()[0];
  CreateScopes(scope);
  auto step_scopes = GetStepScopes(scope);
@@ -88,7 +89,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
        // the weight are located in parent scope
        for (auto& var_name : input.second) {
          if (!step_scope.FindVar(var_name)) {
-            step_scope.NewVar(var_name)->GetMutable<Tensor>();
+            step_scope.NewVar(var_name)->GetMutable<LoDTensor>();
          }
        }
      }
@@ -106,11 +107,12 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
 void RecurrentAlgorithm::InitMemories(Scope* step_scope,
                                      bool infer_shape_mode) const {
  for (auto& attr : arg_->memories) {
-    Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
+    auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "memory [%s]'s boot variable [%s] not exists", attr.var,
                   attr.boot_var);
-    Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
+    auto* boot_mem =
+        step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
    if (infer_shape_mode) {
      pre_mem->Resize(boot_mem->dims());
      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
@@ -192,9 +194,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
                   "memory variable [%s] does not exists", attr.var);
    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                   "boot variable [%s] does not exists", attr.boot_var);
-    Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
+    auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
-    Tensor* boot_mem_grad =
+    auto* boot_mem_grad =
-        step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
+        step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
    if (infer_shape_mode) {
      boot_mem_grad->Resize(mem_grad->dims());
    } else {
@@ -205,7 +207,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
 void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
-                 ->GetMutable<Tensor>()
+                 ->GetMutable<LoDTensor>()
                 ->dims()[0];
  auto step_scopes = GetStepScopes(scope);
  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,

--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -28,7 +28,11 @@ class ReshapeOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    // input check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) shouldn't be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ReshapeOp should not be null.");
    auto shape = ctx.Attr<std::vector<int>>("shape");
    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
    for (auto dim : shape) {
@@ -46,7 +50,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                   [](int a) { return static_cast<int64_t>(a); });
    auto out_dims = framework::make_ddim(shape_int64);
-    ctx.Output<framework::Tensor>("Out")->Resize(out_dims);
+    ctx.Output<framework::LoDTensor>("Out")->Resize(out_dims);
  }
 };
@@ -90,7 +94,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) shouldn't be null.");
    auto dims = ctx.Input<framework::Tensor>("X")->dims();
-    auto *d_in = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *d_in = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    d_in->Resize(dims);
  }
 };

--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -21,6 +21,7 @@ namespace rnn {
 namespace f = paddle::framework;
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks, const size_t seq_len,
@@ -31,7 +32,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
    PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.",
                   inlinks[i].external);
-    Tensor* input = input_var->GetMutable<Tensor>();
+    LoDTensor* input = input_var->GetMutable<LoDTensor>();
    f::DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
@@ -40,6 +41,8 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
      Tensor* step_input =
          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
      if (!infer_shape_mode) {
+        // The input to the operators of each step is a Tensor here.
+        // The Slice function may need to be modified.
        *step_input = input->Slice<float>(j, j + 1);
      }
      step_input->Resize(step_dims);
@@ -54,21 +57,23 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
    auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
    PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.",
                   outlinks[i].external);
-    Tensor* output = output_var->GetMutable<Tensor>();
+    LoDTensor* output = output_var->GetMutable<LoDTensor>();
    if (infer_shape_mode) {
      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
      PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                     outlinks[i].internal);
-      f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
+      f::DDim step_dims =
+          step_scope_var->template GetMutable<LoDTensor>()->dims();
      std::vector<int64_t> dims_vec = vectorize(step_dims);
      dims_vec.insert(dims_vec.begin(), seq_len);
      output->Resize(f::make_ddim(dims_vec));
    } else {
      output->mutable_data<float>(platform::CPUPlace());
      for (size_t j = 0; j < seq_len; j++) {
-        Tensor* step_output =
+        LoDTensor* step_output = step_scopes[j]
-            step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
+                                     ->FindVar(outlinks[i].internal)
+                                     ->GetMutable<LoDTensor>();
        // TODO(luotao02) data type and platform::DeviceContext() should set
        // correctly
        (output->Slice<float>(j, j + 1))
@@ -94,8 +99,8 @@ void LinkMemories(const std::vector<Scope*>& scopes,
  auto scope = scopes[step_id];
  auto linked_scope = scopes[step_id + offset];
  for (auto& attr : memories) {
-    auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
+    auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
+    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
    if (infer_shape_mode) {
      mem->Resize(linked_mem->dims());
    } else {

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -25,6 +25,13 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"),
+                            "Input(b) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of RowwiseAddOp should not be null.");
    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto b_dims = ctx.Input<Tensor>("b")->dims();
    PADDLE_ENFORCE_GT(
@@ -37,7 +44,7 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
        "The width of two operands must be same");
    PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1");
-    ctx.Output<Tensor>("Out")->Resize(x_dims);
+    ctx.Output<framework::LoDTensor>("Out")->Resize(x_dims);
  }
 };
@@ -76,8 +83,8 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(
        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
        "The width of two operands must be same");
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto *db = ctx.Output<Tensor>(framework::GradVarName("b"));
+    auto *db = ctx.Output<framework::LoDTensor>(framework::GradVarName("b"));
    if (dx) dx->Resize(x_dims);
    if (db) db->Resize(b_dims);
  }

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -27,8 +27,13 @@ class ScaleOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of ScaleOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ScaleOp should not be null.");
    auto *in = ctx.Input<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
    out->Resize(in->dims());
  }
 };

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -24,6 +24,15 @@ class ScatterOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ref"),
+                            "Input(Ref) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
+                            "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Updates"),
+                            "Input(Updates) of ScatterOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of ScatterOp should not be null.");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
                      "Update Index should be 1-D.");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
@@ -35,7 +44,8 @@ class ScatterOp : public framework::OperatorWithKernel {
    framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
    for (int i = 1; i < data_dim.size(); ++i)
      PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Updates")->dims()[i]);
-    ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("Ref")->dims());
+    ctx.Output<framework::LoDTensor>("Out")->Resize(
+        ctx.Input<Tensor>("Ref")->dims());
  }
 };
@@ -45,9 +55,11 @@ class ScatterGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *dUpdates =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Updates"));
    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dRef =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Ref"));
    auto *Ref = ctx.Input<Tensor>("Ref");
    dRef->Resize(Ref->dims());

--- a/paddle/operators/sequence_avg_pool_op.cc
+++ b/paddle/operators/sequence_avg_pool_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/sequence_avg_pool_op.h"
+namespace paddle {
+namespace operators {
+// Forward operator of sequence_avg_pool. InferShape validates the input and
+// sets the shape of Out to the shape of X with its first dimension replaced
+// by the number of sequences described by the level-0 LoD of X.
+class SequenceAvgPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"), "Input(X) of SequenceAvgPoolOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of SequenceAvgPoolOp should not be null.");
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto dims = x->dims();
+    auto lod = x->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    // An empty offset vector would make `size() - 1` wrap around in unsigned
+    // arithmetic and reach Resize() as -1, so reject it up front.
+    PADDLE_ENFORCE_GE(lod[0].size(), 1UL,
+                      "The level-0 LoD of Input(X) should not be empty.");
+    // N offsets delimit N - 1 sequences; convert to signed before subtracting.
+    const int64_t batch_size = static_cast<int64_t>(lod[0].size()) - 1;
+    PADDLE_ENFORCE_GE(
+        dims[0], batch_size,
+        "The first dimension of Input(X) must be large than batch size.");
+    dims[0] = batch_size;
+    ctx.Output<framework::LoDTensor>("Out")->Resize({dims});
+  }
+};
+// Describes the interface of sequence_avg_pool: a single input "X", a single
+// output "Out", and the operator comment. No attributes are registered.
+class SequenceAvgPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceAvgPoolOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of SequenceAvgPoolOp.");
+    AddOutput("Out", "The output of SequenceAvgPoolOp.");
+    AddComment(R"DOC(
+    SequenceAvgPoolOp averages features of all time-steps of each instance.
+    More detailed comments will be added later.
+    )DOC");
+  }
+};
+// Backward operator of sequence_avg_pool. InferShape checks that the gradient
+// of Out has the same rank as X and the same size in every dimension except
+// the first, then gives the gradient of X the shape of X.
+class SequenceAvgPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Gradient of Out should not be null");
+    // X is dereferenced below, so fail with a clear message instead of
+    // crashing on a null pointer when it is missing.
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.InputVar("X"),
+        "Input(X) of SequenceAvgPoolGradOp should not be null.");
+    auto og_dims =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->dims();
+    auto x_dims = ctx.Input<framework::LoDTensor>("X")->dims();
+    PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(),
+                      "The rank of output grad must equal to Input(X).");
+    // Dimension 0 is skipped on purpose: it differs between X and Out.
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
+    }
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    x_grad->Resize(x_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the forward op together with its maker, and the grad op under the
+// name sequence_avg_pool_grad.
+REGISTER_OP(sequence_avg_pool, ops::SequenceAvgPoolOp,
+            ops::SequenceAvgPoolOpMaker, sequence_avg_pool_grad,
+            ops::SequenceAvgPoolGradOp);
+// CPU kernels are only instantiated for float.
+REGISTER_OP_CPU_KERNEL(
+    sequence_avg_pool,
+    ops::SequenceAvgPoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_avg_pool_grad,
+    ops::SequenceAvgPoolGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sequence_avg_pool_op.cu
+++ b/paddle/operators/sequence_avg_pool_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/sequence_avg_pool_op.h"
+namespace ops = paddle::operators;
+// GPU kernels reuse the templates from the shared header, instantiated for
+// GPUPlace and float only.
+REGISTER_OP_GPU_KERNEL(
+    sequence_avg_pool,
+    ops::SequenceAvgPoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_avg_pool_grad,
+    ops::SequenceAvgPoolGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sequence_avg_pool_op.h
+++ b/paddle/operators/sequence_avg_pool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+// Forward kernel: for every sequence delimited by the level-0 LoD offsets of
+// X, writes the column-wise mean of that sequence's rows into one row of Out.
+template <typename Place, typename T>
+class SequenceAvgPoolKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    auto dims = in->dims();
+    auto lod = in->lod();
+    // Number of elements per row once all trailing dimensions are flattened.
+    int64_t w = in->numel() / dims[0];
+    out->mutable_data<T>(context.GetPlace());
+    auto place = context.GetEigenDevice<Place>();
+    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
+      Tensor in_t = in->Slice<T>(static_cast<int>(lod[0][i]),
+                                 static_cast<int>(lod[0][i + 1]));
+      Tensor out_t = out->Slice<T>(i, i + 1);
+      // h is the length of the i-th sequence.
+      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      auto in_e = EigenMatrix<T>::From(in_t, {h, w});
+      // Reducing over dimension 0 yields a rank-1 result of w elements, so
+      // the one-row output slice is viewed as a flat vector. Viewing it as
+      // {h, w} would address memory past the slice.
+      auto out_e = framework::EigenVector<T>::Flatten(out_t);
+      out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+    }
+  }
+};
+// Backward kernel: every row of the i-th sequence receives the i-th row of
+// the output gradient divided by the length of that sequence.
+template <typename Place, typename T>
+class SequenceAvgPoolGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // X is an input of the grad op; it is only read for its dims and LoD.
+    auto* in = context.Input<LoDTensor>("X");
+    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto dims = in->dims();
+    auto lod = in->lod();
+    // Number of elements per row once all trailing dimensions are flattened.
+    int64_t w = in->numel() / dims[0];
+    in_g->mutable_data<T>(context.GetPlace());
+    auto place = context.GetEigenDevice<Place>();
+    for (int i = 0; i < static_cast<int>(lod[0].size()) - 1; ++i) {
+      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[0][i]),
+                                   static_cast<int>(lod[0][i + 1]));
+      auto out_g_t = out_g->Slice<T>(i, i + 1);
+      // h is the length of the i-th sequence.
+      int64_t h = static_cast<int64_t>(lod[0][i + 1] - lod[0][i]);
+      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      // Broadcast factors multiply the source dims: {1, w} * (h, 1) gives the
+      // {h, w} shape of in_g_e. A factor of w on the second axis would
+      // produce {h, w * w}.
+      Eigen::DSizes<int, 2> bcast(h, 1);
+      in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -23,10 +23,18 @@ class SGDOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("param"),
-        ctx.Input<Tensor>("param")->dims() == ctx.Input<Tensor>("grad")->dims(),
+                            "Input(param) of SGDOp should not be null.");
-        "Two input of SGD Op's dimension must be same.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("grad"),
-    ctx.Output<Tensor>("param_out")->Resize(ctx.Input<Tensor>("param")->dims());
+                            "Input(grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("param_out"),
+                            "Output(param_out) of SGDOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("param")->dims(),
+                      ctx.Input<Tensor>("grad")->dims(),
+                      "Two input of SGD Op's dimension must be same.");
+    ctx.Output<framework::LoDTensor>("param_out")
+        ->Resize(ctx.Input<Tensor>("param")->dims());
  }
 };

--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -23,7 +23,13 @@ class SigmoidOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SigmoidOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) of SigmoidOp should not be null.");
+    ctx.Output<framework::LoDTensor>("Y")->Resize(
+        ctx.Input<Tensor>("X")->dims());
  }
 };
@@ -44,7 +50,7 @@ class SigmoidOpGrad : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>(framework::GradVarName("X"))
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("Y")->dims());
  }
 };

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -23,9 +23,15 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SoftmaxOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
+                            "Output(Y) of SoftmaxOp should not be null.");
    PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                   "The input of softmax op must be a matrix.");
-    ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx.Output<framework::LoDTensor>("Y")->Resize(
+        ctx.Input<Tensor>("X")->dims());
  }
 };
@@ -71,7 +77,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                      ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
                      "Input(Y) and its gradients should have a same shape.");
-    ctx.Output<Tensor>(framework::GradVarName("X"))
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -23,12 +23,18 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+    PADDLE_ENFORCE_NOT_NULL(
-                            "Input of SquaredL2DistanceOp "
+        ctx.InputVar("X"),
-                            "must be initialized.");
+        "Input(X) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
+    PADDLE_ENFORCE_NOT_NULL(
-                            "Target of SquaredL2DistanceOp "
+        ctx.InputVar("Y"),
-                            "must be initialized.");
+        "Input(Y) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("sub_result"),
+        "Output(sub_result) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of SquaredL2DistanceOp should not be null.");
    auto* x = ctx.Input<Tensor>("X");
    auto x_dims = x->dims();
@@ -48,9 +54,9 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
                   "First dimension of target must be equal to input "
                   "or to 1.");
-    ctx.Output<Tensor>("sub_result")
+    ctx.Output<framework::LoDTensor>("sub_result")
        ->Resize({x_dims[0], x->numel() / x_dims[0]});
-    ctx.Output<Tensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
  }
 };
@@ -94,8 +100,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(out_dims[1], 1,
                      "Second dimension of output gradient "
                      "must be 1.");
-    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* x_grad =
-    auto* y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* y_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
    if (x_grad) x_grad->Resize(x_dims);
    if (y_grad) y_grad->Resize(y_dims);
  }

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -22,8 +22,13 @@ class SumOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
+                   "Input(X) of SumOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of SumOp should not be null.");
    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
    int N = ins.size();
    auto in_dim = ins[0]->dims();
@@ -55,7 +60,8 @@ class SumGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto outputs = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
+    auto outputs =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
    auto dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
    for (auto output : outputs) {
      output->Resize(dims);

--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -24,7 +24,12 @@ class TopkOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input of TopkOP must be initialized.");
+                            "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Indices"),
+                            "Output(Indices) of TopkOp should not be null.");
    auto *input = ctx.Input<framework::Tensor>("X");
    const int k = static_cast<int>(ctx.Attr<int>("k"));
@@ -35,8 +40,8 @@ class TopkOp : public framework::OperatorWithKernel {
    framework::DDim dims = input->dims();
    dims[dims.size() - 1] = k;
-    ctx.Output<Tensor>("Out")->Resize(dims);
+    ctx.Output<framework::LoDTensor>("Out")->Resize(dims);
-    ctx.Output<Tensor>("Indices")->Resize(dims);
+    ctx.Output<framework::LoDTensor>("Indices")->Resize(dims);
  }
 };

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -48,9 +48,13 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(
+        ctx.OutputVar("Out"),
+        "Output(Out) of UniformRandomOp should not be null.");
    PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
                   "uniform_random's min must less then max");
-    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    auto* tensor = ctx.Output<framework::LoDTensor>("Out");
    auto dims = Attr<std::vector<int>>("dims");
    std::vector<int64_t> temp;
    temp.reserve(dims.size());

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -24,3 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
--- a/paddle/operators/concat_op.cu
+++ b/paddle/operators/concat_op.cu
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,8 +12,45 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
+#pragma once
-#include "paddle/operators/concat_op.h"
-namespace ops = paddle::operators;
+#ifndef __NVCC__
-// TODO(Yancey1989) Add GPU kernel
+#error device_ptr_cast must be include by .cu file
+#endif
+#include <thrust/device_ptr.h>
+namespace paddle {
+namespace platform {
+namespace details {
+// Primary template, selected by whether T is a raw pointer type. Only the two
+// specializations below are defined.
+template <typename T, bool is_ptr>
+struct DevicePtrCast;
+// T is a raw pointer: wrap it in a thrust::device_ptr to its element type.
+template <typename T>
+struct DevicePtrCast<T, true> {
+  using ELEM = typename std::remove_pointer<T>::type;
+  using RTYPE = thrust::device_ptr<ELEM>;
+  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
+    return thrust::device_pointer_cast(ele);
+  }
+};
+// T is not a raw pointer: hand the value back unchanged.
+template <typename T>
+struct DevicePtrCast<T, false> {
+  using RTYPE = T;
+  inline RTYPE operator()(RTYPE it) const { return it; }
+};
+// Cast T to thrust::device_ptr if T is a pointer.
+// Otherwise, e.g., T is an iterator, return T itself.
+template <typename T>
+auto DevPtrCast(T t) ->
+    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
+  DevicePtrCast<T, std::is_pointer<T>::value> cast;
+  return cast(t);
+}
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -25,6 +25,10 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
+#ifdef __GNUC__
+#include <cxxabi.h>  // for __cxa_demangle
+#endif
 #ifndef PADDLE_ONLY_CPU
 #include "paddle/platform/dynload/cublas.h"
@@ -42,6 +46,19 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+namespace {
+#ifdef __GNUC__
+// Returns the demangled form of `name`, or `name` itself when
+// abi::__cxa_demangle reports a non-zero status.
+inline std::string demangle(std::string name) {
+  int status = -4;  // some arbitrary value to eliminate the compiler warning
+  char* raw = abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status);
+  // Own the returned buffer so it is released with std::free on every path.
+  std::unique_ptr<char, void (*)(void*)> owner{raw, std::free};
+  if (status != 0) {
+    return name;
+  }
+  return std::string(owner.get());
+}
+#else
+// No demangler is used for other compilers: the symbol is returned as is.
+inline std::string demangle(std::string name) { return name; }
+#endif
+}  // namespace
 struct EnforceNotMet : public std::exception {
  std::exception_ptr exp_;
  std::string err_str_;
@@ -61,8 +78,8 @@ struct EnforceNotMet : public std::exception {
      Dl_info info;
      for (int i = 0; i < size; ++i) {
-        if (dladdr(call_stack[i], &info)) {
+        if (dladdr(call_stack[i], &info) && info.dli_sname) {
-          auto demangled = info.dli_sname;
+          auto demangled = demangle(info.dli_sname);
          auto addr_offset = static_cast<char*>(call_stack[i]) -
                             static_cast<char*>(info.dli_saddr);
          sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,

--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/hostdevice.h"
+#include "paddle/platform/place.h"
+#include <algorithm>
+#include <type_traits>
+#ifdef __NVCC__
+#include <thrust/transform.h>
+#include "paddle/platform/details/device_ptr_cast.h"
+#endif
+namespace paddle {
+namespace platform {
+// Unary transform on host or device, with the same argument order as
+// std::transform. A CPU place runs std::transform; any other place runs
+// thrust::transform when compiled by nvcc and throws otherwise.
+template <typename Place, typename InputIter, typename OutputIter,
+          typename UnaryOperation>
+void Transform(Place place, InputIter first, InputIter last, OutputIter result,
+               UnaryOperation op) {
+  if (is_cpu_place(place)) {
+    std::transform(first, last, result, op);
+    return;
+  }
+#ifdef __NVCC__
+  // Raw pointers are wrapped as thrust::device_ptr; iterators pass through.
+  using namespace details;
+  thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
+                    op);
+#else
+  PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
+#endif
+}
+// Binary transform on host or device, with the same argument order as the
+// two-range overload of std::transform. A CPU place runs std::transform; any
+// other place runs thrust::transform when compiled by nvcc and throws
+// otherwise.
+template <typename Place, typename InputIter1, typename InputIter2,
+          typename OutputIter, typename BinaryOperation>
+void Transform(Place place, InputIter1 first1, InputIter1 last1,
+               InputIter2 first2, OutputIter result, BinaryOperation op) {
+  if (is_cpu_place(place)) {
+    std::transform(first1, last1, first2, result, op);
+  } else {
+#ifdef __NVCC__
+    // Raw pointers are wrapped as thrust::device_ptr; iterators pass through.
+    using namespace details;
+    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
+                      DevPtrCast(result), op);
+#else
+    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
+#endif
+  }
+}
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/transform.h"
+// Unary functor that multiplies its argument by the factor given at
+// construction. The call operator is marked HOSTDEVICE.
+template <typename T>
+class Scale {
+ public:
+  explicit Scale(const T& scale) : factor_(scale) {}
+  HOSTDEVICE T operator()(const T& value) const { return value * factor_; }
+ private:
+  T factor_;
+};
+// Binary functor returning the product of its two arguments. The call
+// operator is marked HOSTDEVICE.
+template <typename T>
+class Multiply {
+ public:
+  HOSTDEVICE T operator()(const T& lhs, const T& rhs) const {
+    return lhs * rhs;
+  }
+};
+// Scales four host floats in place by 10 via the CPU path and expects
+// 1, 2, 3, 4 within 1e-5.
+TEST(Transform, CPUUnary) {
+  using namespace paddle::platform;
+  constexpr int kCount = 4;
+  float values[kCount] = {0.1, 0.2, 0.3, 0.4};
+  float* const begin = values;
+  Transform(CPUPlace(), begin, begin + kCount, begin, Scale<float>(10));
+  for (int idx = 0; idx != kCount; ++idx) {
+    const float expected = static_cast<float>(idx + 1);
+    ASSERT_NEAR(values[idx], expected, 1e-5);
+  }
+}
+// Copies four floats to GPU 0, scales them in place by 10 on the device,
+// copies them back and expects 1, 2, 3, 4 within 1e-5.
+TEST(Transform, GPUUnary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  constexpr int kCount = 4;
+  GPUPlace gpu0(0);
+  float host[kCount] = {0.1, 0.2, 0.3, 0.4};
+  float* device = static_cast<float*>(Alloc(gpu0, sizeof(float) * kCount));
+  Copy(gpu0, device, CPUPlace(), host, sizeof(host));
+  Transform(gpu0, device, device + kCount, device, Scale<float>(10));
+  Copy(CPUPlace(), host, gpu0, device, sizeof(host));
+  // The device buffer is released before the assertions, which only read host.
+  Free(gpu0, device);
+  for (int idx = 0; idx != kCount; ++idx) {
+    const float expected = static_cast<float>(idx + 1);
+    ASSERT_NEAR(host[idx], expected, 1e-5);
+  }
+}
+// Multiplies a host int array element-wise with itself in place via the CPU
+// path and expects the squares 1, 4, 9, 16.
+TEST(Transform, CPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  constexpr int kCount = 4;
+  int values[kCount] = {1, 2, 3, 4};
+  int* const begin = values;
+  Transform(CPUPlace(), begin, begin + kCount, begin, begin, Multiply<int>());
+  for (int idx = 0; idx != kCount; ++idx) {
+    const int root = idx + 1;
+    ASSERT_EQ(root * root, values[idx]);
+  }
+}
+// Copies four ints to GPU 0, multiplies the buffer element-wise with itself
+// in place on the device, copies it back and expects the squares 1, 4, 9, 16.
+TEST(Transform, GPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  constexpr int kCount = 4;
+  int host[kCount] = {1, 2, 3, 4};
+  GPUPlace gpu0(0);
+  int* device = static_cast<int*>(Alloc(gpu0, sizeof(host)));
+  Copy(gpu0, device, CPUPlace(), host, sizeof(host));
+  Transform(gpu0, device, device + kCount, device, device, Multiply<int>());
+  Copy(CPUPlace(), host, gpu0, device, sizeof(host));
+  // The device buffer is released before the assertions, which only read host.
+  Free(gpu0, device);
+  for (int idx = 0; idx != kCount; ++idx) {
+    const int root = idx + 1;
+    ASSERT_EQ(root * root, host[idx]);
+  }
+}
\ No newline at end of file
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -19,10 +19,12 @@ limitations under the License. */
 #include "paddle/framework/backward.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/cond_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/pybind.h"
 #include "paddle/pybind/tensor_py.h"
 #include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
@@ -31,34 +33,6 @@ limitations under the License. */
 namespace py = pybind11;
-USE_OP(add);
-USE_OP(onehot_cross_entropy);
-USE_OP(sgd);
-USE_OP(mul);
-USE_OP(elementwise_mul);
-USE_OP(mean);
-USE_OP(sigmoid);
-USE_OP(softmax);
-USE_OP(rowwise_add);
-USE_OP(fill_zeros_like);
-USE_NO_KERNEL_OP(recurrent);
-USE_OP(gaussian_random);
-USE_OP(uniform_random);
-USE_OP(lookup_table);
-USE_OP(scale);
-USE_NO_KERNEL_OP(identity);
-USE_OP(minus);
-USE_OP(cos_sim);
-USE_CPU_ONLY_OP(gather);
-USE_OP(pad);
-USE_CPU_ONLY_OP(scatter);
-USE_OP(crop);
-USE_CPU_ONLY_OP(concat);
-USE_OP(top_k);
-USE_OP(squared_l2_distance);
-USE_OP(sum);
-USE_OP(reshape);
 namespace paddle {
 namespace framework {
@@ -124,27 +98,21 @@ PYBIND11_PLUGIN(core) {
        return self.data<float>()[offset];
      });
-  py::class_<LoDTensor>(m, "LoDTensor", R"DOC(LoD(Leval of Ddetails) Tensor.
+  py::class_<LoDTensor, Tensor>(m, "LoDTensor")
+      .def_buffer(
-The tensor and LoD info should be created before creating the LoDTensor, then
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
-call the set_tensor and set_lod functions to set them.
+      .def(
+          "__init__",
-)DOC")
+          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-      .def("__init__",
-           [](LoDTensor &instance,
-              const std::vector<std::vector<size_t>> &lod,
-              Tensor *t) {
 #ifdef PADDLE_ONLY_CPU
-             new (&instance) LoDTensor(lod, t);
+            new (&instance) LoDTensor(lod);
 #else
             paddle::framework::LoD new_lod;
             new_lod.reserve(lod.size());
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             new (&instance) LoDTensor(new_lod, t);
+             new (&instance) LoDTensor(new_lod);
 #endif
-           })
+          })
-      .def("set_tensor",
-           [](LoDTensor &self, Tensor *tensor) { self.set_tensor(tensor); })
      .def("set_lod",
           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
 #ifdef PADDLE_ONLY_CPU
@@ -156,9 +124,6 @@ call the set_tensor and set_lod functions to set them.
             self.set_lod(new_lod);
 #endif
           })
-      .def("tensor",
-           [](LoDTensor &self) -> Tensor & { return self.tensor(); },
-           py::return_value_policy::reference)
      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
 #ifdef PADDLE_ONLY_CPU
        return self.lod();
@@ -187,9 +152,6 @@ All parameter, weight, gradient are variables in Paddle.
           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
      .def("get_tensor",
-           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
-           py::return_value_policy::reference)
-      .def("get_lod_tensor",
           [](Variable &self) -> LoDTensor * {
             return self.GetMutable<LoDTensor>();
           },
@@ -327,6 +289,28 @@ All parameter, weight, gradient are variables in Paddle.
           [](operators::RecurrentOp &self, const operators::NetOp &net)
               -> void { self.set_stepnet(net.Clone()); });
+  // cond_op
+  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
+      .def_static("create",
+                  [](py::bytes protobin) -> operators::CondOp * {
+                    OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    auto cond_op = OpRegistry::CreateOp(desc);
+                    return static_cast<operators::CondOp *>(cond_op.release());
+                  })
+      .def("set_truenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_truenet(net.Clone());
+           })
+      .def("set_falsenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_falsenet(net.Clone());
+           });
  m.def("unique_integer", UniqueIntegerGenerator);
  m.def("is_compile_gpu", IsCompileGPU);

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2055,20 +2055,26 @@ class ConvLayerBase(LayerBase):
        if num_filters is not None:
            self.config.num_filters = num_filters
+        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
        use_gpu = int(g_command_config_args.get("use_gpu", 0))
        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-        # Automatically select cudnn_type for GPU and exconv for CPU
+        # Automatically select cudnn_conv for GPU, exconv for CPU
+        # and mkldnn_conv for MKLDNN
        # if set type=conv, but still reserve the way user specify
-        # exconv or cudnn_conv manually.
+        # exconv, mkldnn_conv or cudnn_conv manually.
        if self.layer_type == "cudnn_conv":
            config_assert(use_gpu, "cudnn_conv only support GPU")
+        if self.layer_type == "mkldnn_conv":
+            config_assert(use_mkldnn, "mkldnn_conv only support MKLDNN")
        if (use_gpu == 1 and self.layer_type != "exconv" and
+                self.layer_type != "mkldnn_conv" and
            (parallel_nn == 0 or self.config.device > -1)):
            self.layer_type = "cudnn_conv"
        else:
-            self.layer_type = "exconv"
+            self.layer_type = "mkldnn_conv" if use_mkldnn else "exconv"
        # need to specify layer in config
        self.config.type = self.layer_type
@@ -2100,6 +2106,11 @@ class ConvLayer(ConvLayerBase):
    layer_type = 'exconv'
+@config_layer('mkldnn_conv')
+class ConvLayer(ConvLayerBase):
+    layer_type = 'mkldnn_conv'
 @config_layer('cudnn_conv')
 class ConvLayer(ConvLayerBase):
    layer_type = 'cudnn_conv'

--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -215,5 +215,27 @@ class __RecurrentOp__(object):
        return core.RecurrentOp.create(proto.SerializeToString())
+class __CondOp__(object):
+    __proto__ = None
+    type = "cond"
+    def __init__(self):
+        # cache cond_op's proto
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+    def __call__(self, *args, **kwargs):
+        if self.type not in args and "type" not in kwargs:
+            kwargs["type"] = self.type
+        # create proto
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create condop
+        return core.CondOp.create(proto.SerializeToString())
 Operator = OperatorFactory()  # The default global factory
 RecurrentOp = __RecurrentOp__()
+CondOp = __CondOp__()
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -181,8 +181,10 @@ class OpTest(unittest.TestCase):
        self.op.infer_shape(self.scope)
        ctx = core.DeviceContext.create(place)
        self.op.run(self.scope, ctx)
+        print "finish self.op.run"
        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
+            print "finish Operator.get_op_outputs"
+            print "out_dup=%s; out_name=%s" % (out_dup, out_name)
            if out_dup:
                sub_out = self.outputs[out_name]
                for sub_out_name in sub_out:
@@ -194,12 +196,17 @@ class OpTest(unittest.TestCase):
                            actual, expect, atol=1e-05),
                        "output name: " + out_name + "has diff")
            else:
+                v = self.scope.find_var(out_name)
+                print "var=%s" % v
+                print "tensor=%s" % v.get_tensor()
                actual = np.array(self.scope.find_var(out_name).get_tensor())
+                print "actual=%s" % actual
                expect = self.outputs[out_name]
                self.assertTrue(
                    np.allclose(
                        actual, expect, atol=1e-05),
                    "output name: " + out_name + "has diff")
+                print "finish check in %s" % place
    def check_output(self):
        places = [core.CPUPlace()]

--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
+import logging
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+from paddle.v2.framework.op import Operator, CondOp
+class PySimpleCond(object):
+    '''
+    A simple implementation of dynamic if-else based on numpy
+    '''
+    def __init__(self):
+        array = [1] * 10
+        for i in range(1, 10, 2):
+            array[i] = 0
+        self.cond = np.array(array)
+        self.x = np.ones(shape=(10, 1))
+    def forward(self):
+        self.index_t = np.where(self.cond == 1)
+        self.index_f = np.where(self.cond == 0)
+        y_t = self.x[self.index_t]
+        y_f = self.x[self.index_f]
+        y_t = y_t * 2.
+        y_f = y_f * (-2.)
+        output = np.zeros(shape=(10, 1))
+        output[self.index_t] = y_t
+        output[self.index_f] = y_f
+        return output
+class PySimpleCondTest(unittest.TestCase):
+    def setUp(self):
+        self.condnn = PySimpleCond()
+    def test_forward(self):
+        output = self.condnn.forward()
+def create_tensor(scope, name, shape, np_data):
+    tensor = scope.new_var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+class TestCondOp(unittest.TestCase):
+    '''
+    Test CondOp
+    equation:
+        cond = [True, False, True, False, ...]
+        y[index_t] = x[index_t] * 2.
+        y[index_f] = x[index_f] * -2.
+    outputs:
+        y
+    '''
+    def setUp(self):
+        self.py_cond = PySimpleCond()
+    def forward(self):
+        self.scope = core.Scope()
+        self.create_global_variables()
+        self.create_cond_op()
+        self.create_sub_net()
+        ctx = core.DeviceContext.create(core.CPUPlace())
+        self.condop.infer_shape(self.scope)
+        self.condop.run(self.scope, ctx)
+        return np.array(self.scope.find_var("Out").get_tensor())
+    def create_global_variables(self):
+        x_np_data = self.py_cond.x
+        create_tensor(self.scope, "X", [10, 1], x_np_data)
+        cond_np_data = self.py_cond.cond.astype("int32")
+        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
+        self.scope.new_var("SubScopes")
+        self.scope.new_var("IndexTensors")
+        self.scope.new_var("Out")
+    def create_cond_op(self):
+        self.condop = CondOp(
+            Cond="cond",
+            Xs=["X"],
+            Outs=["Out"],
+            SubScopes="SubScopes",
+            IndexTensors="IndexTensors")
+    def create_sub_net(self):
+        truenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
+        truenet.append_op(scale_op_t)
+        truenet.complete_add_op(True)
+        self.condop.set_truenet(truenet)
+        falsenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
+        falsenet.append_op(scale_op_t)
+        falsenet.complete_add_op(True)
+        self.condop.set_falsenet(falsenet)
+    def test_forward(self):
+        print 'test cond op forward'
+        pd_output = self.forward()
+        py_output = self.py_cond.forward()
+        print 'pd_output', pd_output
+        print
+        print 'py_output', py_output
+        self.assertEqual(pd_output.shape, py_output.shape)
+        print 'test passed'
+        return 0
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@@ -7,8 +7,8 @@ class TestCosSimOp(OpTest):
    def setUp(self):
        self.op_type = "cos_sim"
        self.inputs = {
-            'X': np.random.random((10, 5)).astype("float32"),
+            'X': np.random.random((6, 5)).astype("float32"),
-            'Y': np.random.random((10, 5)).astype("float32")
+            'Y': np.random.random((6, 5)).astype("float32")
        }
        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
@@ -28,12 +28,66 @@ class TestCosSimOp(OpTest):
    def test_check_grad_ingore_x(self):
        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
-    def test_check_grad_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
-if __name__ == "__main__":
+class TestCosSimOp2(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5)).astype("float32"),
+            'Y': np.random.random((1, 5)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+class TestCosSimOp3(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5, 2)).astype("float32"),
+            'Y': np.random.random((6, 5, 2)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+class TestCosSimOp4(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5, 2)).astype("float32"),
+            'Y': np.random.random((1, 5, 2)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_crop_op.py
+++ b/python/paddle/v2/framework/tests/test_crop_op.py
@@ -52,39 +52,40 @@ class TestCropOp(OpTest):
    def test_check_output(self):
        self.check_output()
+        print "finish check_output"
-    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
+    #def test_check_grad_normal(self):
+    #    self.check_grad(['X'], 'Out', max_relative_error=0.006)
-class TestCase1(TestCropOp):
+    #class TestCase1(TestCropOp):
-    def initTestCase(self):
+    #    def initTestCase(self):
-        self.x_shape = (16, 16, 16)
+    #        self.x_shape = (16, 16, 16)
-        self.crop_shape = [2, 2, 3]
+    #        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
+    #        self.offsets = [1, 5, 3]
+    #
+    #
-class TestCase2(TestCropOp):
+    #class TestCase2(TestCropOp):
-    def initTestCase(self):
+    #    def initTestCase(self):
-        self.x_shape = (4, 4)
+    #        self.x_shape = (4, 4)
-        self.crop_shape = [4, 4]
+    #        self.crop_shape = [4, 4]
-        self.offsets = [0, 0]
+    #        self.offsets = [0, 0]
+    #
+    #
-class TestCase3(TestCropOp):
+    #class TestCase3(TestCropOp):
-    def initTestCase(self):
+    #    def initTestCase(self):
-        self.x_shape = (16, 16, 16)
+    #        self.x_shape = (16, 16, 16)
-        self.crop_shape = [2, 2, 3]
+    #        self.crop_shape = [2, 2, 3]
-        self.offsets = [1, 5, 3]
+    #        self.offsets = [1, 5, 3]
-        self.crop_by_input = True
+    #        self.crop_by_input = True
+    #
+    #
-class TestCase4(TestCropOp):
+    #class TestCase4(TestCropOp):
-    def initTestCase(self):
+    #    def initTestCase(self):
-        self.x_shape = (4, 4)
+    #        self.x_shape = (4, 4)
-        self.crop_shape = [4, 4]
+    #        self.crop_shape = [4, 4]
-        self.offsets = [0, 0]
+    #        self.offsets = [0, 0]
-        self.crop_by_input = True
+    #        self.crop_by_input = True
+    #
 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@@ -4,7 +4,7 @@ from paddle.v2.framework.op import Operator
 import numpy
-class GaussianRandomTest(unittest.TestCase):
+class TestGaussianRandomOp(unittest.TestCase):
    def test_cpu(self):
        self.gaussian_random_test(place=core.CPUPlace())

--- a/python/paddle/v2/framework/tests/test_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_identity_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestIdentityOp(OpTest):
+    def setUp(self):
+        self.op_type = "identity"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.outputs = {'Out': self.inputs['X']}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_lookup_table.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
-class MinusOpTest(OpTest):
+class TestMinusOp(OpTest):
    def setUp(self):
        self.op_type = "minus"
        self.inputs = {

--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -3,25 +3,27 @@ import numpy
 from op_test import OpTest
-class TestCrossEntropy(OpTest):
+class TestOnehotCrossEntropyOp(OpTest):
    def setUp(self):
        self.op_type = "onehot_cross_entropy"
        batch_size = 30
        class_num = 10
        X = numpy.random.uniform(0.1, 1.0,
                                 [batch_size, class_num]).astype("float32")
-        label = (class_num / 2) * numpy.ones(batch_size).astype("int32")
+        labels = numpy.random.randint(0, class_num, batch_size, dtype="int32")
-        self.inputs = {'X': X, 'label': label}
-        Y = []
+        cross_entropy = numpy.asmatrix(
-        for i in range(0, batch_size):
+            [[-numpy.log(X[i][labels[i]])] for i in range(X.shape[0])],
-            Y.append(-numpy.log(X[i][label[i]]))
+            dtype="float32")
-        self.outputs = {'Y': numpy.array(Y).astype("float32")}
+        self.inputs = {"X": X, "label": labels}
+        self.outputs = {"Y": cross_entropy}
    def test_check_output(self):
        self.check_output()
    def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y")
 if __name__ == "__main__":

--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
@@ -22,7 +22,7 @@ class TestPadOp(OpTest):
        self.check_output()
    def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', max_relative_error=0.006)
    def initTestCase(self):
        self.shape = (16, 16)

--- a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
@@ -3,20 +3,7 @@ import numpy as np
 from op_test import OpTest
-class IdentityTest(OpTest):
+class TestScaleOp(OpTest):
-    def setUp(self):
-        self.op_type = "identity"
-        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
-        self.outputs = {'Out': self.inputs['X']}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-class ScaleTest(OpTest):
    def setUp(self):
        self.op_type = "scale"
        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}

--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
-class TestSGD(OpTest):
+class TestSGDOp(OpTest):
    def setUp(self):
        self.op_type = "sgd"
        w = np.random.random((102, 105)).astype("float32")

--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
-class TestSigmoid(OpTest):
+class TestSigmoidOp(OpTest):
    def setUp(self):
        self.op_type = "sigmoid"
        self.inputs = {

--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -44,79 +44,66 @@ class TestTensor(unittest.TestCase):
        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
    def test_int_lod_tensor(self):
-        places = [core.CPUPlace(), core.GPUPlace(0)]
+        place = core.CPUPlace()
-        for place in places:
+        scope = core.Scope()
-            scope = core.Scope()
+        var_lod = scope.new_var("test_lod_tensor")
-            var = scope.new_var("test_tensor")
+        lod_tensor = var_lod.get_tensor()
-            var_lod = scope.new_var("test_lod_tensor")
-            tensor = var.get_tensor()
-            lod_tensor = var_lod.get_lod_tensor()
-            tensor.set_dims([4, 4, 6])
-            tensor.alloc_int(place)
-            array = numpy.array(tensor)
-            array[0, 0, 0] = 3
-            array[3, 3, 5] = 10
-            tensor.set(array, place)
-            lod_tensor.set_tensor(tensor)
+        lod_tensor.set_dims([4, 4, 6])
-            lod_tensor.set_lod([[0, 2, 4]])
+        lod_tensor.alloc_int(place)
+        array = numpy.array(lod_tensor)
+        array[0, 0, 0] = 3
+        array[3, 3, 5] = 10
+        lod_tensor.set(array, place)
+        lod_tensor.set_lod([[0, 2, 4]])
-            lod_v = numpy.array(lod_tensor.tensor())
+        lod_v = numpy.array(lod_tensor)
-            self.assertTrue(numpy.alltrue(array == lod_v))
+        self.assertTrue(numpy.alltrue(array == lod_v))
-            lod = lod_tensor.lod()
+        lod = lod_tensor.lod()
-            self.assertEqual(0, lod[0][0])
+        self.assertEqual(0, lod[0][0])
-            self.assertEqual(2, lod[0][1])
+        self.assertEqual(2, lod[0][1])
-            self.assertEqual(4, lod[0][2])
+        self.assertEqual(4, lod[0][2])
    def test_float_lod_tensor(self):
-        places = [core.CPUPlace(), core.GPUPlace(0)]
+        place = core.CPUPlace()
-        for place in places:
+        scope = core.Scope()
-            scope = core.Scope()
+        var_lod = scope.new_var("test_lod_tensor")
-            var = scope.new_var("test_tensor")
-            var_lod = scope.new_var("test_lod_tensor")
-            tensor = var.get_tensor()
-            lod_tensor = var_lod.get_lod_tensor()
-            tensor.set_dims([5, 2, 3, 4])
-            tensor.alloc_float(place)
-            tensor_array = numpy.array(tensor)
+        lod_tensor = var_lod.get_tensor()
-            self.assertEqual((5, 2, 3, 4), tensor_array.shape)
+        lod_tensor.set_dims([5, 2, 3, 4])
-            tensor_array[0, 0, 0, 0] = 1.0
+        lod_tensor.alloc_float(place)
-            tensor_array[0, 0, 0, 1] = 2.0
-            tensor.set(tensor_array, place)
-            lod_tensor.set_tensor(tensor)
+        tensor_array = numpy.array(lod_tensor)
+        self.assertEqual((5, 2, 3, 4), tensor_array.shape)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
-            lod_v = numpy.array(lod_tensor.tensor())
+        lod_v = numpy.array(lod_tensor)
-            self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
-            self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-            self.assertEqual(len(lod_tensor.lod()), 0)
+        self.assertEqual(len(lod_tensor.lod()), 0)
-            lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-            lod_tensor.set_lod(lod_py)
+        lod_tensor.set_lod(lod_py)
-            lod = lod_tensor.lod()
+        lod = lod_tensor.lod()
-            self.assertListEqual(lod_py, lod)
+        self.assertListEqual(lod_py, lod)
    def test_lod_tensor_init(self):
        scope = core.Scope()
-        var = scope.new_var("test_tensor")
        place = core.CPUPlace()
-        tensor = var.get_tensor()
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        tensor.set_dims([5, 2, 3, 4])
+        lod_tensor = core.LoDTensor(lod_py)
-        tensor.alloc_float(place)
-        tensor_array = numpy.array(tensor)
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.alloc_float(place)
+        tensor_array = numpy.array(lod_tensor)
        tensor_array[0, 0, 0, 0] = 1.0
        tensor_array[0, 0, 0, 1] = 2.0
-        tensor.set(tensor_array, place)
+        lod_tensor.set(tensor_array, place)
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor = core.LoDTensor(lod_py, tensor)
+        lod_v = numpy.array(lod_tensor)
-        lod_v = numpy.array(lod_tensor.tensor())
        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
        self.assertListEqual(lod_py, lod_tensor.lod())

--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
@@ -21,6 +21,9 @@ class TestTopkOp(OpTest):
        self.outputs = {'Out': output, 'Indices': indices}
+    def test_check_output(self):
+        self.check_output()
 class TestTopkOp3d(OpTest):
    def setUp(self):
@@ -42,6 +45,9 @@ class TestTopkOp3d(OpTest):
        self.outputs = {'Out': output, 'Indices': indices}
+    def test_check_output(self):
+        self.check_output()
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -4,7 +4,7 @@ import paddle.v2.framework.core as core
 import numpy
-class UniformRandomTest(unittest.TestCase):
+class TestUniformRandomOp(unittest.TestCase):
    def test_uniform_random_cpu(self):
        self.uniform_random_test(place=core.CPUPlace())