Commit 812c5f60 authored by chengduoZH

remove conflict

@@ -8,15 +8,19 @@ function clock_to_seconds() {
}

function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
-  thread=`nproc`
-  if [ $thread -gt $bs ]; then
-    thread=$bs
+  trainers=`nproc`
+  if [ $trainers -gt $bs ]; then
+    trainers=$bs
  fi
-  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+  threads=$((`nproc` / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  export OPENBLAS_NUM_THREADS=$threads

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
@@ -28,7 +32,7 @@ function infer() {
      --config="${topology}.py" \
      --use_mkldnn=False \
      --use_gpu=False \
-      --trainer_count=$thread \
+      --trainer_count=$trainers \
      --log_period=$log_period \
      --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
      --init_model_path=$models_in \
...
set -e

function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  export OPENBLAS_NUM_THREADS=1
  topology=$1
  layer_num=$2
  bs=$3
...
@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
    :noindex:

+sub_seq
+---------
+.. autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
concat
------
.. autoclass:: paddle.v2.layer.concat
...
@@ -68,12 +68,6 @@ scale
   :noindex:

-reshape
----------
-.. autofunction:: paddle.v2.fluid.layers.reshape
-   :noindex:
-
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
...
@@ -79,7 +79,7 @@ class Optimizer(object):
    def minimize(self, loss, parameter_list):
        """Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list)
...
@@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像
------------------------------

-执行下面的命令获取最新的PaddlePaddle Docker镜像
+执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:

  .. code-block:: bash
@@ -27,7 +27,7 @@
     docker pull docker.paddlepaddle.org/paddle

-下载GPU版本的Docker镜像:
+下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像:

  .. code-block:: bash
@@ -54,7 +54,7 @@
.. _docker_run:

在Docker中执行PaddlePaddle训练程序
-------------------------------
+----------------------------------

假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
@@ -82,7 +82,7 @@
.. _docker_run_book:

使用Docker启动PaddlePaddle Book教程
-------------------------------
+-----------------------------------

使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
...
@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image
------------------------------

-Run the following command to download the latest Docker images:
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:

  .. code-block:: bash
@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
     docker pull docker.paddlepaddle.org/paddle

-Download GPU version images:
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:

  .. code-block:: bash
@@ -58,7 +58,7 @@ and run:
.. _docker_run:

Launch your training program in Docker
-------------------------------
+--------------------------------------

Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to
...
@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------

-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas

  .. code-block:: bash

     pip install paddlepaddle

-如果需要安装支持GPU的版本,需要执行:
+如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:

  .. code-block:: bash
...
@@ -12,14 +12,14 @@ Install Using pip
------------------------------

Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements.
+machine, it will also download requirements, the version is cpu_avx_openblas.

  .. code-block:: bash

     pip install paddlepaddle

-If you wish to install GPU version, just run:
+If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:

  .. code-block:: bash
...
@@ -7,13 +7,13 @@
++++++++

PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
-执行下面的命令完成快速安装:
+执行下面的命令完成快速安装,版本为cpu_avx_openblas

  .. code-block:: bash

     pip install paddlepaddle

-如果需要安装支持GPU的版本,需要执行:
+如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:

  .. code-block:: bash
...
@@ -8,13 +8,13 @@ Quick Install

You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install:
+Simply run the following command to install, the version is cpu_avx_openblas:

  .. code-block:: bash

     pip install paddlepaddle

-If you need to install GPU version, run:
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:

  .. code-block:: bash
...
@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)

-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+if (WITH_GPU)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+else()
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+endif ()
+
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+if (WITH_GPU)
+  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
+else()
+  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+endif()
+
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/data_transform.h"
+#include "paddle/framework/lod_tensor.h"

namespace paddle {
namespace framework {
...
@@ -27,9 +27,8 @@ limitations under the License. */
namespace paddle {
namespace framework {

-using DataTransformFN =
-    std::function<void(const std::vector<platform::DeviceContext*> ctx,
-                       const Variable& in, Variable* out)>;
+using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
+                                           const Variable& in, Variable* out)>;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

struct KernelTypePairHash {
@@ -47,7 +46,7 @@ struct KernelTypePairHash {
};

using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;

class DataTransformFnMap {
 public:
@@ -58,25 +57,25 @@ class DataTransformFnMap {
  }

  void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    Insert(std::make_pair(left, right), data_tranform_fn);
  }

  void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    PADDLE_ENFORCE(!Has(kernel_type_pair),
                   "KernelTypePair %s has been registered", "");
    map_.insert({kernel_type_pair, data_tranform_fn});
  }

-  const DataTransformFN& Get(const KernelTypePair& key_pair) const {
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
    auto data_transformer = GetNullable(key_pair);
    PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFN should not be NULL");
+                            "DataTransformFn should not be NULL");
    return *data_transformer;
  }

-  const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
    auto it = map_.find(key_pair);
    if (it == map_.end()) {
      return nullptr;
...
@@ -11,36 +11,61 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include <array>
+#include <vector>
+
+#include "paddle/framework/data_transform.h"

#include <gtest/gtest.h>
-#include "paddle/framework/data_transform.h"

namespace paddle {
namespace framework {
using namespace platform;

+/**
+ * @brief cross validation of different kernel type transform
+ * We use four bit map represent different combination.
+ * If the field has multiple possible value, only choose two of them.
+ * For DataType, only test the FP32(float), FP64(double).
+ * e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
+ *      1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
+ */
+
+std::array<proto::DataType, 2> kDataType = {
+    {proto::DataType::FP32, proto::DataType::FP64}};
+
+std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+
+std::array<DataLayout, 2> kDataLayout = {
+    {DataLayout::kNHWC, DataLayout::kNCHW}};
+
+std::array<LibraryType, 2> kLibraryType = {
+    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+
+OpKernelType GenFromBit(const std::vector<bool> bits) {
+  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
+                      kLibraryType[bits[3]]);
+}
+
int test_value = 0;

-OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                           LibraryType::kCUDNN);
-OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
-OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
+auto kernel0 = GenFromBit({0, 0, 0, 0});
+auto kernel1 = GenFromBit({0, 0, 0, 1});
+auto kernel2 = GenFromBit({0, 0, 1, 0});
+auto kernel3 = GenFromBit({0, 0, 1, 1});

-void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
+                     Variable* out) {
  test_value++;
}

-void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
+                       Variable* out) {
  test_value--;
}

-void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
+                        Variable* out) {
  test_value += 2;
}
@@ -49,12 +74,9 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,

namespace frw = paddle::framework;

-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
-                           frw::type1_to_type2);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
-                           frw::type2_to_type3);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
-                           frw::type1_to_type3);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);

TEST(DataTransform, Register) {
  using namespace paddle::framework;
@@ -62,17 +84,16 @@ TEST(DataTransform, Register) {
  auto& instance = DataTransformFnMap::Instance();
  ASSERT_EQ(instance.Map().size(), 3UL);

-  std::vector<DeviceContext*> ctx;
+  DeviceContext* ctx = nullptr;
  paddle::framework::Variable in;
  paddle::framework::Variable out;

-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
-                                                                       &out);
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
  ASSERT_EQ(test_value, 1);
-  instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
  ASSERT_EQ(test_value, 0);
-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
  ASSERT_EQ(test_value, 2);
}
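As an aside, the DataTransformFnMap exercised above is essentially a hash map keyed by a (from, to) kernel-type pair. The following is a minimal standalone sketch of that map-with-pair-key pattern in plain C++; the names (KernelId, TransformFn, PairHash) are hypothetical and are not part of this commit.

#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <utility>

struct KernelId {
  int data_type;
  int layout;
  bool operator==(const KernelId& o) const {
    return data_type == o.data_type && layout == o.layout;
  }
};

using KernelIdPair = std::pair<KernelId, KernelId>;
using TransformFn = std::function<void(const float* in, float* out)>;

struct PairHash {
  size_t operator()(const KernelIdPair& p) const {
    // Pack the four small fields into one hash value.
    return (static_cast<size_t>(p.first.data_type) << 24) ^
           (static_cast<size_t>(p.first.layout) << 16) ^
           (static_cast<size_t>(p.second.data_type) << 8) ^
           static_cast<size_t>(p.second.layout);
  }
};

int main() {
  std::unordered_map<KernelIdPair, TransformFn, PairHash> registry;
  KernelId cpu_nchw{0, 0}, cpu_nhwc{0, 1};

  // Register one transform for the (cpu_nchw -> cpu_nhwc) pair.
  registry[{cpu_nchw, cpu_nhwc}] = [](const float* in, float* out) {
    out[0] = in[0];  // a real transform would permute the layout here
  };

  float src = 3.0f, dst = 0.0f;
  auto it = registry.find({cpu_nchw, cpu_nhwc});
  if (it != registry.end()) it->second(&src, &dst);
  std::cout << dst << "\n";  // prints 3
  return 0;
}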
@@ -14,18 +14,17 @@ limitations under the License. */

#include "paddle/framework/executor.h"

-#include <algorithm>
-#include <iostream>
-#include <memory>
#include <set>
-#include <vector>

+#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
-#include "paddle/framework/scope.h"
+
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");

namespace paddle {
namespace framework {
@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  }
}

+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&
+      tensor.type().hash_code() != typeid(double).hash_code()) {
+    return;
+  }
+  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
+  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+}
+
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  // TODO(tonyyang-svail):
@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
      auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
      VLOG(3) << op->DebugString();
      op->Run(*local_scope, place_);
+      if (FLAGS_check_nan_inf) {
+        for (auto& vname : op->OutputVars(true)) {
+          auto* var = local_scope->FindVar(vname);
+          if (var == nullptr) continue;
+          if (var->IsType<framework::LoDTensor>()) {
+            CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+          }
+        }
+      }
    }
-    if (create_local_scope) {
+    if (create_vars && create_local_scope) {
      scope->DeleteScope(local_scope);
    }
  }
...
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
    places.emplace_back(platform::CPUPlace());
    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  platform::DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Init(places);
  return true;
}
...
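The Create-to-Init rename above pairs with the DeviceContextPool::Instance() and pool.Get(place) calls used elsewhere in this commit. A minimal standalone sketch of that singleton-pool API shape follows; the Pool class and its integer payload are hypothetical and are not the commit's code.

#include <cassert>
#include <map>
#include <memory>
#include <string>

class Pool {
 public:
  // Global accessor; valid only after Init() has been called once.
  static Pool& Instance() {
    assert(pool_ != nullptr && "Call Pool::Init first");
    return *pool_;
  }
  // Builds the singleton from the given entries, first call wins.
  static void Init(const std::map<std::string, int>& entries) {
    if (pool_ == nullptr) pool_.reset(new Pool(entries));
  }
  // Per-key lookup, analogous to Get(place).
  int Get(const std::string& key) const { return entries_.at(key); }

 private:
  explicit Pool(const std::map<std::string, int>& entries)
      : entries_(entries) {}
  std::map<std::string, int> entries_;
  static std::unique_ptr<Pool> pool_;
};

std::unique_ptr<Pool> Pool::pool_;

int main() {
  Pool::Init({{"cpu", 0}, {"gpu0", 1}});
  assert(Pool::Instance().Get("gpu0") == 1);
  return 0;
}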
@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {

void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                       const platform::DeviceContext &dev_ctx) {
-  // TODO(typhoonzero): serialize to ostream
-  {  // the 1st field, uint32_t version
+  {  // the 1st field, uint32_t version for LoDTensor
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
  }
-  {  // the 2nd field, tensor description
-    // int32_t  size
-    // void*    protobuf message
-    proto::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto *pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto &gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void *>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char *>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  {  // the 4th field, lod information
-    // uint64_t  lod_level
-    // uint64_t  lod_level_1 size in byte.
-    // int*      lod_level_1 data
-    // ...
+  {
+    // the 2st field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int*     lod_level_1 data
+    // ...
    auto lod = tensor.lod();
    uint64_t size = lod.size();
    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
               static_cast<std::streamsize>(size));
    }
  }
+  // the 3st field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}

void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void *buf;
-    platform::Place cpu = platform::CPUPlace();
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char *>(buf), tensor->memory_size());
-  }
-  {  // read lod
+  {
+    // the 1st field, unit32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2st field, LoD information
    uint64_t lod_level;
    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
    auto &lod = *tensor->mutable_lod();
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
      lod[i] = tmp;
    }
  }
+  // the 3st filed, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
}

}  // namespace framework
...
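The rewritten LoDTensor serialization above layers the stream as a uint32 version, the LoD information, and then the plain Tensor (which itself carries a size-prefixed TensorDesc, moved into tensor_util.h later in this commit). Below is a minimal standalone sketch of that version-plus-length-prefixed stream pattern, round-tripped through string streams; the Write/Read helpers are hypothetical and are not the commit's code.

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

void Write(std::ostream& os, const std::string& header,
           const std::vector<float>& data) {
  constexpr uint32_t version = 0;  // 1st field: format version
  os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  int32_t hsize = static_cast<int32_t>(header.size());  // 2nd field: header
  os.write(reinterpret_cast<const char*>(&hsize), sizeof(hsize));
  os.write(header.data(), hsize);
  uint64_t n = data.size();  // 3rd field: raw payload
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os.write(reinterpret_cast<const char*>(data.data()), n * sizeof(float));
}

void Read(std::istream& is, std::string* header, std::vector<float>* data) {
  uint32_t version = 0;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  assert(version == 0 && "only version 0 is supported");
  int32_t hsize = 0;
  is.read(reinterpret_cast<char*>(&hsize), sizeof(hsize));
  header->resize(hsize);
  is.read(&(*header)[0], hsize);
  uint64_t n = 0;
  is.read(reinterpret_cast<char*>(&n), sizeof(n));
  data->resize(n);
  is.read(reinterpret_cast<char*>(data->data()), n * sizeof(float));
}

int main() {
  std::ostringstream oss;
  Write(oss, "desc", {1.f, 2.f, 3.f});
  std::istringstream iss(oss.str());
  std::string header;
  std::vector<float> data;
  Read(iss, &header, &data);
  assert(header == "desc" && data.size() == 3 && data[2] == 3.f);
  return 0;
}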
@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
  EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
}

+TEST_F(LoDTensorTester, SerializeAndDeserialize) {
+  LoDTensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+  std::ostringstream oss;
+  SerializeToStream(oss, lod_tensor_, cpu_ctx);
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+  float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], i);
+  }
+  EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
+}
+
TEST(LodExpand, test) {
  LoD lod{{0, 2}};
  LoDTensor tensor;
...
@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
  need_update_ = true;
}

+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
    : desc_(desc), need_update_(false) {
  // restore inputs_
...
@@ -35,6 +35,8 @@ class OpDesc {

  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);

+  void CopyFrom(const OpDesc &op_desc);
+
  proto::OpDesc *Proto();

  std::string Type() const { return desc_.type(); }
...
@@ -68,6 +68,8 @@ struct OpKernelType {
           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
           library_type_ == o.library_type_;
  }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
};

inline std::ostream& operator<<(std::ostream& os,
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
  return os;
}

+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
}  // namespace framework
}  // namespace paddle
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                              LibraryType::kCUDNN);

-  std::ostringstream stream;
-  stream << op_kernel_type;
  ASSERT_EQ(
-      stream.str(),
+      paddle::framework::KernelTypeToString(op_kernel_type),
      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
}
...
@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const Scope& scope_;
};

+const platform::DeviceContext* GetDeviceContext(
+    framework::KernelTypePair& kernel_pair) {
+  auto& actual_kernel_key = kernel_pair.first;
+  auto& expected_kernel_key = kernel_pair.second;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  if (platform::is_gpu_place(actual_kernel_key.place_) &&
+      platform::is_cpu_place(expected_kernel_key.place_)) {
+    return pool.Get(actual_kernel_key.place_);
+  } else if (platform::is_cpu_place(actual_kernel_key.place_) &&
+             platform::is_gpu_place(expected_kernel_key.place_)) {
+    return pool.Get(expected_kernel_key.place_);
+  } else {
+    PADDLE_THROW(
+        "Currently, model parallelism is only supported between CPU and CUDA");
+  }
+}
+
void OperatorWithKernel::Run(const Scope& scope,
                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto dev_ctx = pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);

  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
  }

  if (actual_kernel_key == expected_kernel_key) {
-    kernel_iter->second->Compute(ctx);
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will failed.");
  } else {
-    Scope& op_scope = scope.NewScope();
-    auto input_vars = this->InputVars();
-    for (auto var_name : input_vars) {
-      op_scope.Var(var_name);
-    }
-
-    // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
-    platform::DeviceContext* trans_dev_ctx = nullptr;
-    std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
-
-    // TODO(qijun) get appropriate DataTransformFN from global map
-    framework::DataTransformFN trans_fun = nullptr;
-
-    // Wait for transform starting
-    dev_ctx->Wait();
-
-    for (auto var_name : input_vars) {
-      trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
-                op_scope.FindVar(var_name));
-    }
-    // Wait for data transform finishing
-    for (auto ctx : trans_dev_ctx_vec) {
-      ctx->Wait();
-    }
-
-    // Create a new ExecutionContext
-    ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
-    kernel_iter->second->Compute(op_ctx);
+    auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(kernel_pair);
+    if (trans_fun) {
+      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that has been transformed
+      std::vector<std::string> need_trans;
+      for (auto var_name : input_vars) {
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
+      }
+
+      if (!need_trans.empty()) {
+        auto trans_dev_ctx = GetDeviceContext(kernel_pair);
+
+        // Wait for transform starting
+        dev_ctx->Wait();
+
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
+        }
+        // Wait for data transform finishing
+        trans_dev_ctx->Wait();
+      }
+    }
  }

+  kernel_iter->second->Compute(ctx);
}

OpKernelType OperatorWithKernel::GetActualKernelType(
...
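The else branch above stores a transformed input in the scope under the name var_name + KernelTypeToString(expected_kernel_key), so a variable is transformed at most once per target kernel type. A standalone sketch of that cache-by-suffixed-name idea follows; the names are hypothetical, a map stands in for the Scope, and a copy stands in for the real transform.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::vector<float>> scope;  // stand-in Scope
  scope["x"] = {1.f, 2.f};

  const std::string kernel_str =
      "data_type[FP32]:data_layout[NCHW]:place[CUDAPlace]:library_type[PLAIN]";

  auto transform_if_needed = [&](const std::string& name) {
    const std::string trans_name = name + kernel_str;
    if (scope.count(trans_name) == 0) {   // not cached for this kernel yet
      scope[trans_name] = scope[name];    // "transform" is just a copy here
      std::cout << "transformed " << name << "\n";
    }
  };

  transform_if_needed("x");  // runs the transform
  transform_if_needed("x");  // hits the cache, does nothing
  return 0;
}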
@@ -12,5 +12,58 @@ limitations under the License. */

#include "paddle/framework/selected_rows.h"

namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{
// the 2st field, rows information
auto& rows = selected_rows.rows();
uint64_t size = rows.size();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
for (uint64_t i = 0; i < size; ++i) {
os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
}
}
{
// the 3st field, the height of SelectedRows
int64_t height = selected_rows.height();
os.write(reinterpret_cast<const char*>(&height), sizeof(height));
}
// the 4st field, Tensor data
SerializeToStream(os, selected_rows.value(), dev_ctx);
}
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
auto tensor = *selected_rows->mutable_value();
{
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{
// the 2st field, rows information
uint64_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
auto& rows = *selected_rows->mutable_rows();
rows.resize(size);
for (uint64_t i = 0; i < size; ++i) {
is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
}
}
{
// the 3st field, the height of the SelectedRows
int64_t height;
is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
selected_rows->set_height(height);
}
// the 4st field, tensor which contains the data
DeserializeFromStream(is, &tensor);
}
} // namespace framework
}  // namespace paddle
@@ -59,5 +59,14 @@ class SelectedRows {
  int64_t height_;
};

+/*
+ * Serialize/Desiralize SelectedRows to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
}  // namespace framework
}  // namespace paddle
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
}

+TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+}
+
}  // namespace framework
}  // namespace paddle
@@ -178,7 +178,7 @@ class Tensor {
  DDim dims_;

  /**
-   * @brief the layout of memory block, default is NCHW.
+   * @brief the layout of memory block, default is NHWC.
   *
   * @note the memory allocation order, describe how weight/data is stored
   *       For example, in 4-D Tensor(rank=4), there are three commonly
...
@@ -15,12 +15,13 @@
#include <gtest/gtest.h>
#include <string>

+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
TEST(Tensor, Dims) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor tt;
+  framework::Tensor tt;
  tt.Resize({2, 3, 4});
-  DDim dims = tt.dims();
+  framework::DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(i + 2, dims[i]);
@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
}

TEST(Tensor, DataAssert) {
-  paddle::framework::Tensor src_tensor;
+  framework::Tensor src_tensor;

  bool caught = false;
  try {
    src_tensor.data<double>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (platform::EnforceNotMet err) {
    caught = true;
    std::string msg =
        "holder_ should not be null\nTensor holds no memory. Call "
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
   because Memory::Alloc() and Memory::Free() have not been ready.
*/
TEST(Tensor, MutableData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
  }

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
  }
#endif
}

TEST(Tensor, ShareDataWith) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
    }
    ASSERT_TRUE(caught);

-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
}

TEST(Tensor, Slice) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
+                                 platform::CPUPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 3);
    EXPECT_EQ(slice_dims[0], 2);
    EXPECT_EQ(slice_dims[1], 3);
@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
+            slice_tensor.dims(), platform::CPUPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
+                                    platform::CUDAPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
    EXPECT_EQ(slice_dims[0], 4);
    EXPECT_EQ(slice_dims[1], 9);

    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
+    uintptr_t src_mutable_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
+            src_tensor.dims(), platform::CUDAPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
+            slice_tensor.dims(), platform::CUDAPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
}

TEST(Tensor, ReshapeToMatrix) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
+  framework::Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
  for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
    src_ptr[i] = i;
  }
-  Tensor res = ReshapeToMatrix(src, 2);
+  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
  ASSERT_EQ(res.dims()[0], 2 * 3);
  ASSERT_EQ(res.dims()[1], 4 * 9);
}

TEST(Tensor, Layout) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  ASSERT_EQ(src.layout(), DataLayout::kNHWC);
-  src.set_layout(DataLayout::kAnyLayout);
-  ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
+  framework::Tensor src;
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  src.set_layout(framework::DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_util.h"
namespace paddle {
namespace framework {
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void operator()() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
CopyFrom(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct HasNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool HasNAN(const framework::Tensor& tensor) {
HasNANPredicate predicate;
return Any(tensor, predicate);
}
struct HasInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool HasInf(const framework::Tensor& tensor) {
HasInfPredicate predicate;
return Any(tensor, predicate);
}
} // namespace framework
} // namespace paddle
./tensor_util.cc
\ No newline at end of file
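The new tensor_util.cc above implements HasNAN/HasInf as an Eigen predicate reduced with any() over the tensor. The same idea, as a small standalone sketch over a plain float buffer using std::isnan and std::isinf (not the commit's Eigen-based code):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// Generic "does any element satisfy the predicate" reduction.
template <typename Pred>
bool Any(const std::vector<float>& buf, Pred pred) {
  return std::any_of(buf.begin(), buf.end(), pred);
}

bool HasNan(const std::vector<float>& buf) {
  return Any(buf, [](float v) { return std::isnan(v); });
}

bool HasInf(const std::vector<float>& buf) {
  return Any(buf, [](float v) { return std::isinf(v); });
}

int main() {
  assert(HasNan({0.f, NAN, 1.f}));
  assert(HasInf({0.f, INFINITY, 0.5f}));
  assert(!HasNan({1.f, 2.f}) && !HasInf({1.f, 2.f}));
  return 0;
}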
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/framework.pb.h"
#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"

namespace paddle {
namespace framework {
@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
      src_ptr, size);
}

// Returns true if a tensor contains NAN, i.e., Not A Number.
bool HasNAN(const framework::Tensor& tensor);
// Returns true if a tensor contains Inf, i.e., Infinity.
bool HasInf(const framework::Tensor& tensor);
inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
platform::Place cpu = platform::CPUPlace();
// TODO(Yancey1989): use VisiterDataType instead of DataType switch
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
}  // namespace framework
}  // namespace paddle
@@ -13,6 +13,7 @@

#include "paddle/framework/tensor_util.h"
#include <gtest/gtest.h>
+#include <cmath>
#include <string>

namespace paddle {
@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
#endif
}

TEST(HasNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
float* buf = src.mutable_data<float>({3}, CPUPlace());
buf[0] = 0.0;
buf[1] = NAN;
buf[2] = 0.0;
ASSERT_TRUE(HasNAN(src));
}
TEST(HasInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
double* buf = src.mutable_data<double>({3}, CPUPlace());
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
ASSERT_TRUE(HasInf(src));
}
TEST(Tensor, SerializeAndDeserialize) {
framework::Tensor src_tensor;
int array[6] = {1, 2, 3, 4, 5, 6};
src_tensor.Resize({2, 3});
int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
src_ptr[i] = array[i];
}
{
framework::Tensor dst_tensor;
auto place = new platform::CPUPlace();
platform::CPUDeviceContext cpu_ctx(*place);
std::ostringstream oss;
SerializeToStream(oss, src_tensor, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete place;
}
#ifdef PADDLE_WITH_CUDA
{
Tensor gpu_tensor;
gpu_tensor.Resize({2, 3});
Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
std::ostringstream oss;
SerializeToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete gpu_place;
}
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
static __global__ void FillNAN(float* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2] = NAN;
}
static __global__ void FillInf(float* buf) {
buf[0] = 0.0;
buf[1] = INFINITY;
buf[2] = 0.5;
}
TEST(HasNAN, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasNAN(tensor));
}
TEST(HasInf, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasInf(tensor));
}
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <condition_variable> #include <condition_variable>
#include <functional> #include <functional>
#include <future>
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <thread> #include <thread>
...@@ -25,10 +26,11 @@ limitations under the License. */ ...@@ -25,10 +26,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
typedef std::function<void()> Task;
class ThreadPool { class ThreadPool {
public: public:
typedef std::packaged_task<void()> Task;
typedef std::function<void()> Fun;
/** /**
* @brief Get a instance of threadpool, the thread number will * @brief Get a instance of threadpool, the thread number will
* be specified as the number of hardware thread contexts * be specified as the number of hardware thread contexts
...@@ -61,13 +63,18 @@ class ThreadPool { ...@@ -61,13 +63,18 @@ class ThreadPool {
/** /**
* @brief Push a function to the queue, and will be scheduled and * @brief Push a function to the queue, and will be scheduled and
* executed if a thread is available. * executed if a thread is available.
* @param[in] Task will be pushed to the task queue. * @param[in] Task, will be pushed to the task queue.
   * @return std::future<void>, the caller can wait for the task to finish
   * by calling f.wait().
*/ */
void Run(const Task& fn) { std::future<void> Run(const Fun& fn) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
tasks_.push(fn); Task task(std::bind(fn));
std::future<void> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock(); lock.unlock();
scheduled_.notify_one(); scheduled_.notify_one();
return f;
} }
/** /**
...@@ -110,7 +117,7 @@ class ThreadPool { ...@@ -110,7 +117,7 @@ class ThreadPool {
break; break;
} }
// pop a task from the task queue // pop a task from the task queue
auto task = tasks_.front(); auto task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
--available_; --available_;
......
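Since Run now returns a std::future<void> wrapping a packaged_task, callers can block per task instead of draining the whole pool with Wait(). A minimal usage sketch, mirroring the updated test below (illustrative only; the helper name SubmitAndWait is hypothetical):
// Illustrative usage sketch: submit n jobs and wait on each returned future.
// Assumes "paddle/framework/threadpool.h", <future> and <vector> are included.
void SubmitAndWait(paddle::framework::ThreadPool* pool, int n) {
  std::vector<std::future<void>> futures;
  for (int i = 0; i < n; ++i) {
    futures.push_back(pool->Run([i]() { /* per-task work for job i */ }));
  }
  for (auto& f : futures) {
    f.wait();  // returns once the corresponding task has executed
  }
}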
...@@ -20,16 +20,21 @@ limitations under the License. */ ...@@ -20,16 +20,21 @@ limitations under the License. */
namespace framework = paddle::framework; namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) { void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
pool->Run([&sum]() { sum.fetch_add(1); }); auto f = pool->Run([&sum]() { sum.fetch_add(1); });
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
} }
} }
TEST(ThreadPool, ConcurrentInit) { TEST(ThreadPool, ConcurrentInit) {
framework::ThreadPool* pool; framework::ThreadPool* pool;
int concurrent_cnt = 50; int n = 50;
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < concurrent_cnt; ++i) { for (int i = 0; i < n; ++i) {
std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) { ...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
} }
} }
TEST(ThreadPool, ConcurrentStart) { TEST(ThreadPool, ConcurrentRun) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0); std::atomic<int> sum(0);
std::vector<std::thread> threads; std::vector<std::thread> threads;
int concurrent_cnt = 50; int n = 50;
// sum = (n * (n + 1)) / 2 // sum = (n * (n + 1)) / 2
for (int i = 1; i <= concurrent_cnt; ++i) { for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i); std::thread t(do_sum, pool, std::ref(sum), i);
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) { ...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
t.join(); t.join();
} }
pool->Wait(); pool->Wait();
EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); EXPECT_EQ(sum, ((n + 1) * n) / 2);
} }
...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const { ...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarDesc::LOD_TENSOR_ARRAY: case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor(); return desc_.tensor_array().tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
} }
} }
......
...@@ -126,14 +126,165 @@ public: ...@@ -126,14 +126,165 @@ public:
inputData += inputChannels * inputHeight * inputWidth; inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth; outputData += outputChannels * outputHeight * outputWidth;
} }
}
};
#ifdef PADDLE_MOBILE_INFERENCE #ifdef PADDLE_MOBILE_INFERENCE
if (Device == DEVICE_TYPE_CPU) {
memory_.reset(); /*
* \brief Forward calculation of convolution, optimized for mobile.
*/
template <DeviceType Device>
class GemmConvMobileFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
// TODO(hedaoyuan): Need to define some index macros,
    // to avoid using 0 and 1.
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
real beta;
if (outputs[0].getArgType() == ADD_TO) {
beta = 1.0;
} else {
beta = 0.0;
} }
#endif
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
size_t colWidth = outputHeight * outputWidth;
    // Max col matrix height 256, Max col matrix width 2048
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
Im2ColMobileFunctor<real> im2col;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth;
int kStride = colHeight;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight;
colHeightStart += stepColHeight) {
for (size_t colWidthStart = 0; colWidthStart < colWidth;
colWidthStart += stepColWidth) {
int N = std::min(colWidth - colWidthStart, stepColWidth);
int K = std::min(colHeight - colHeightStart, stepColHeight);
// im2col
im2col(inputData + g * inputOffset,
imShape,
colData,
colShape,
strideH(),
strideW(),
paddingH(),
paddingW(),
dilationH(),
dilationW(),
colHeightStart,
K,
colWidthStart,
N);
// gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset + colHeightStart,
kStride,
colData,
N,
beta_,
outputData + g * outputOffset + colWidthStart,
nStride);
}
beta_ = 1.0;
}
} else {
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
BlasGemm<Device, real>::compute(false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
inputData + g * inputOffset,
N,
beta,
outputData + g * outputOffset,
N);
}
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
} }
}; };
#endif
/* /*
* \brief Backward input calculation of convolution. * \brief Backward input calculation of convolution.
*/ */
...@@ -348,7 +499,11 @@ public: ...@@ -348,7 +499,11 @@ public:
} }
}; };
#ifdef PADDLE_MOBILE_INFERENCE
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
#else
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
#endif
REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
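The mobile GemmConv path added above tiles the col matrix into blocks of at most 256 x 2048 elements and runs im2col plus GEMM per tile, so the scratch buffer stays bounded regardless of layer size (the first tile uses the caller's beta, later tiles accumulate with beta = 1). A rough sketch of the buffer-size arithmetic, illustrative only and with a hypothetical helper name:
// Illustrative only: upper bound on the scratch buffer allocated by
// GemmConvMobileFunction via resizeBuffer(stepColHeight * stepColWidth * sizeof(real)).
#include <algorithm>
#include <cstddef>
inline size_t MobileColBufferBytes(size_t colHeight, size_t colWidth, size_t elemSize) {
  size_t stepH = std::min(colHeight, static_cast<size_t>(256));
  size_t stepW = std::min(colWidth, static_cast<size_t>(2048));
  return stepH * stepW * elemSize;  // at most 256 * 2048 * 4 bytes = 2 MiB for float
}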
...@@ -98,4 +98,54 @@ public: ...@@ -98,4 +98,54 @@ public:
int dilationWidth = 1); int dilationWidth = 1);
}; };
template <class T>
class Im2ColMobileFunctor {
public:
void operator()(const T* imData,
const TensorShape& imShape,
T* colData,
const TensorShape& colShape,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int dilationHeight,
int dilationWidth,
int colHeightStart,
int colHeightSize,
int colWidthStart,
int colWidthSize) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) {
int wOffset = (colHeightStart + colh) % filterWidth;
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
for (int colw = 0; colw < colWidthSize; colw++) {
int h = (colWidthStart + colw) / outputWidth;
int w = (colWidthStart + colw) % outputWidth;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[colh * colWidthSize + colw] =
imData[imRowIdx * inputWidth + imColIdx];
}
}
}
}
};
} // namespace paddle } // namespace paddle
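In Im2ColMobileFunctor above, each col-matrix row index encodes (input channel, kernel row, kernel col) and each column index encodes (output row, output col); positions that fall into the padding region are written as zero. A small decode sketch of that index mapping (illustrative only; the struct and function names are hypothetical):
// Illustrative decode of the col-matrix indices used by Im2ColMobileFunctor.
struct ColRowIdx { int channel, kernelRow, kernelCol; };
struct ColColIdx { int outputRow, outputCol; };
inline ColRowIdx DecodeColRow(int row, int filterHeight, int filterWidth) {
  return {row / (filterWidth * filterHeight),   // c_im
          (row / filterWidth) % filterHeight,   // hOffset
          row % filterWidth};                   // wOffset
}
inline ColColIdx DecodeColCol(int col, int outputWidth) {
  return {col / outputWidth, col % outputWidth};  // (h, w)
}
// Example: with a 5x7 filter, row index 40 decodes to channel 1, kernel row 0, kernel col 5.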
...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); } ...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
#endif #endif
template <class T>
void TestIm2ColMobileFunctor() {
for (size_t channels : {32}) {
for (size_t inputHeight : {33, 100}) {
for (size_t inputWidth : {32, 96}) {
for (size_t filterHeight : {5}) {
for (size_t filterWidth : {7}) {
for (size_t stride : {2}) {
for (size_t padding : {1}) {
for (size_t dilation : {1, 3}) {
size_t filterSizeH = (filterHeight - 1) * dilation + 1;
size_t filterSizeW = (filterWidth - 1) * dilation + 1;
if (inputHeight + 2 * padding < filterSizeH ||
inputWidth + 2 * padding < filterSizeW)
break;
if (padding >= filterSizeH || padding >= filterSizeW) break;
size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
size_t outputWidth =
(inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape imShape =
TensorShape({channels, inputHeight, inputWidth});
TensorShape colShape1 = TensorShape({channels,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
size_t height = channels * filterHeight * filterWidth;
size_t width = outputHeight * outputWidth;
VectorPtr input1 =
Vector::create(imShape.getElements(), false);
VectorPtr input2 =
Vector::create(imShape.getElements(), false);
MatrixPtr output1 =
Matrix::create(height, width, false, false);
MatrixPtr output2 =
Matrix::create(height, width, false, false);
input1->uniform(0.001, 1);
input2->copyFrom(*input1);
Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
Im2ColMobileFunctor<T> im2Col2;
im2Col1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation);
im2Col2(input2->getData(),
imShape,
output2->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation,
0,
height,
0,
width);
autotest::TensorCheckEqual(*output1, *output2);
}
}
}
}
}
}
}
}
}
TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap, ...@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
} }
/* the size of inputs for norm-layer is 1 */ /* the size of inputs for norm-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1UL); CHECK_EQ(config_.inputs_size(), 1);
const NormConfig& conf = config_.inputs(0).norm_conf(); const NormConfig& conf = config_.inputs(0).norm_conf();
localSize_ = conf.size(); localSize_ = conf.size();
alpha_ = conf.scale(); alpha_ = conf.scale();
......
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h) set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET) function(op_library TARGET)
...@@ -48,6 +49,11 @@ function(op_library TARGET) ...@@ -48,6 +49,11 @@ function(op_library TARGET)
message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file") message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file")
endif() endif()
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
...@@ -181,56 +187,26 @@ endfunction() ...@@ -181,56 +187,26 @@ endfunction()
add_subdirectory(math) add_subdirectory(math)
add_subdirectory(nccl) add_subdirectory(nccl)
set(DEPS_OPS if(WITH_GPU)
cond_op op_library(nccl_op DEPS nccl_common)
cross_entropy_op else()
recurrent_op set(DEPS_OPS ${DEPS_OPS} nccl_op)
softmax_with_cross_entropy_op endif()
softmax_op
sequence_softmax_op
sum_op
pool_op
maxout_op
unpool_op
pool_with_index_op
conv_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
max_sequence_len_op
lstm_op
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op
save_op
load_op
send_op
recv_op
cos_sim_op)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(detail) add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties( set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
send_op.cc op_library(send_op DEPS ${DISTRIBUTE_DEPS})
PROPERTIES set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
set_source_files_properties( else()
recv_op.cc set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
PROPERTIES
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
endif() endif()
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op DEPS framework_proto tensor net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax) op_library(softmax_op DEPS softmax)
...@@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling) ...@@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting) op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling) op_library(unpool_op DEPS unpooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table) op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col) op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) op_library(recurrent_op DEPS executor)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
...@@ -271,13 +242,12 @@ endforeach() ...@@ -271,13 +242,12 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU) if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
...@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase { ...@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
size_t offset; size_t offset;
if (platform::is_gpu_place(i_tensor.place())) { if (platform::is_gpu_place(i_tensor.place())) {
......
...@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { ...@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
} }
auto slice = out->Slice(out_offset, out_offset + len); auto slice = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice); dev_ctx, &slice);
......
...@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase { ...@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
out != nullptr, out != nullptr,
"The Output(Out) should not be null if the Input(X) is set."); "The Output(Out) should not be null if the Input(X) is set.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
} }
......
...@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { ...@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Borrow(dev_place); auto& dev_ctx = *pool.Get(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx); framework::ExecutionContext ctx(*this, scope, dev_ctx);
......
...@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, ...@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
void CondOp::Run(const Scope& scope, const platform::Place& place) const { void CondOp::Run(const Scope& scope, const platform::Place& place) const {
// get device context from pool // get device context from pool
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Borrow(place); auto& dev_ctx = *pool.Get(place);
PrepareDataForSubnet(scope, dev_ctx); PrepareDataForSubnet(scope, dev_ctx);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope); std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
......
...@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups"); int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"Conv intput should be 4-D or 5-D tensor."); "Conv intput should be 4-D or 5-D tensor.");
...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddings.size(), strides.size(), paddings.size(), strides.size(),
"Conv paddings dimension and Conv strides dimension should be the same."); "Conv paddings dimension and Conv strides dimension should be the same.");
int input_channels = in_dims[1];
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter " "The number of input channels should be equal to filter "
"channels * groups."); "channels * groups.");
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
output_channels % groups, 0, output_channels % groups, 0,
"The number of output channels should be divided by groups."); "The number of output channels should be divided by groups.");
......
...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, " "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
"where N is the batch size and D is the number of classes. " " where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, " "This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator."); "which is almost always the result of a softmax operator.");
AddInput("Label", AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. When " "(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape " "soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a " "[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x K]."); "Tensor<float/double> with shape [N x D].");
AddOutput("Y", AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape " "(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss."); "[N x 1]. The cross entropy loss.");
......
...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(feed_item, place, dev_ctx, out_item); framework::CopyFrom(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
......
...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { ...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate // FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs? // CPU outputs?
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait(); dev_ctx.Wait();
......
...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
} }
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value); math::set_constant(dev_ctx, &out, value);
} }
}; };
......
...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { ...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
if (!force_cpu && platform::is_gpu_place(place)) { if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out // Copy tensor to out
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(tensor, place, dev_ctx, &out); framework::CopyFrom(tensor, place, dev_ctx, &out);
} }
} }
......
...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { ...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
framework::DeserializeFromStream(fin, tensor); DeserializeFromStream(fin, tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
// copy CPU to GPU // copy CPU to GPU
......
...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
auto slice = out[i].Slice(static_cast<int>(offset), auto slice = out[i].Slice(static_cast<int>(offset),
static_cast<int>(offset + len)); static_cast<int>(offset + len));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin), framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)), static_cast<int>(each_range.end)),
......
...@@ -9,9 +9,9 @@ if(WITH_GPU) ...@@ -9,9 +9,9 @@ if(WITH_GPU)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
...@@ -24,9 +24,9 @@ else() ...@@ -24,9 +24,9 @@ else()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context)
......
...@@ -302,8 +302,29 @@ void set_constant(const platform::DeviceContext& context, ...@@ -302,8 +302,29 @@ void set_constant(const platform::DeviceContext& context,
#endif #endif
} }
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>; template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>; template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>; template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>; template struct ColwiseSum<platform::CPUDeviceContext, double>;
......
...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>( ...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
template <typename T>
__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
int num) {
T tmp = 1.0 / width;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
int h = i * tmp;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <typename T>
struct RowwiseAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
input.data<T>(), vector.data<T>(), output->data<T>(),
static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
}
};
template struct RowwiseAdd<platform::CUDADeviceContext, float>; template struct RowwiseAdd<platform::CUDADeviceContext, float>;
template struct RowwiseAdd<platform::CUDADeviceContext, double>; template struct RowwiseAdd<platform::CUDADeviceContext, double>;
template struct ColwiseSum<platform::CUDADeviceContext, float>; template struct ColwiseSum<platform::CUDADeviceContext, float>;
......
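The new CUDA RowwiseAdd kernel above broadcasts the length-`width` vector across every row of the flattened [rows x width] input, recovering the column index from the flat element index. A CPU reference of the same computation (illustrative only; the function name is hypothetical):
// Illustrative CPU reference for RowwiseAddKernel: c[i] = a[i] + b[i % width].
inline void RowwiseAddReference(const float* a, const float* b, float* c,
                                int width, int num) {
  for (int i = 0; i < num; ++i) {
    c[i] = a[i] + b[i % width];
  }
}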
...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
eigen_out.device(*dev) = eigen_in.shuffle(permute); eigen_out.device(*dev) = eigen_in.shuffle(permute);
} }
template <typename DeviceContext, typename T>
void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenMatrix<T>::From(vector);
auto out = framework::EigenMatrix<T>::From(*output);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
out.device(*context.eigen_device()) =
in + vec.reshape(shape).broadcast(bcast);
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
......
...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>(); auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
......
...@@ -305,7 +305,7 @@ int main(int argc, char **argv) { ...@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
......
...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { ...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
false /*create_local_scope*/); false /*create_local_scope*/);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
for (size_t step_id = 0; step_id < seq_len; ++step_id) { for (size_t step_id = 0; step_id < seq_len; ++step_id) {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
......
...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { ...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
auto x_sliced = x.Slice(x_offset, x_offset + len); auto x_sliced = x.Slice(x_offset, x_offset + len);
auto out_sliced = out->Slice(out_offset, out_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
out_offset += len; out_offset += len;
return out_offset; return out_offset;
......
...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { ...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
} }
......
...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
This operator will recv tensor from send_op This operator will send tensor to recv_op.
)DOC"); )DOC");
AddAttr<std::vector<std::string>>("endpoints", AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
......
...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (dout_var == nullptr) { // dx_tensor fill zero if (dout_var == nullptr) { // dx_tensor fill zero
math::set_constant(dev_ctx, &dx_tensor, 0.0f); math::set_constant(dev_ctx, &dx_tensor, 0.0f);
......
...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
auto &x_lod = x.lod(); auto &x_lod = x.lod();
auto &mask_dim = mask.dims(); auto &mask_dim = mask.dims();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) { if (platform::is_cpu_place(mask.place())) {
......
...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { ...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
if (x_tensor.memory_size() > 0) { if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
CopyFrom(x_tensor, place, dev_ctx, out_tensor); CopyFrom(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
...@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp {
auto *out_tensor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, place); size_t offset = GetOffset(scope, place);
if (offset < x_array.size()) { if (offset < x_array.size()) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
......
...@@ -17,7 +17,7 @@ namespace platform { ...@@ -17,7 +17,7 @@ namespace platform {
DeviceContextPool* DeviceContextPool::pool = nullptr; DeviceContextPool* DeviceContextPool::pool = nullptr;
const platform::DeviceContext* DeviceContextPool::Borrow( const platform::DeviceContext* DeviceContextPool::Get(
const platform::Place& place) { const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( ...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
return it->second; return it->second;
} }
std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto it = device_contexts_.find(place);
if (it != device_contexts_.end()) {
borrowed_contexts.emplace_back(it->second);
} else {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
}
return borrowed_contexts;
}
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
......
...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext { ...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = CPUDeviceContext;
};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice; class EigenCudaStreamDevice;
...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
}; };
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext;
};
class CUDNNDeviceContext : public CUDADeviceContext { class CUDNNDeviceContext : public CUDADeviceContext {
public: public:
explicit CUDNNDeviceContext(CUDAPlace place); explicit CUDNNDeviceContext(CUDAPlace place);
...@@ -109,13 +122,13 @@ class DeviceContextPool { ...@@ -109,13 +122,13 @@ class DeviceContextPool {
public: public:
explicit DeviceContextPool(const std::vector<platform::Place>& places); explicit DeviceContextPool(const std::vector<platform::Place>& places);
static DeviceContextPool& Get() { static DeviceContextPool& Instance() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool; return *pool;
} }
  /*! \brief Create should only be called by Init function */ /*! \brief Create should only be called by Init function */
static DeviceContextPool& Create(const std::vector<platform::Place>& places) { static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
if (pool == nullptr) { if (pool == nullptr) {
pool = new DeviceContextPool(places); pool = new DeviceContextPool(places);
} }
...@@ -123,13 +136,14 @@ class DeviceContextPool { ...@@ -123,13 +136,14 @@ class DeviceContextPool {
} }
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
const platform::DeviceContext* Borrow(const platform::Place& place); const platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return handle of multi-device context. */
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places);
~DeviceContextPool() {} template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
......
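After this rename, the pool is obtained via Instance(), Get(place) returns the type-erased DeviceContext*, and the new GetByPlace() template returns the statically typed context selected by DefaultDeviceContextType. A short usage sketch (illustrative only; the helper name QueryPool is hypothetical, and the pool must have been created with Init first):
// Illustrative usage of the renamed DeviceContextPool API.
// Assumes "paddle/platform/device_context.h" is included.
inline void QueryPool() {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  const paddle::platform::DeviceContext* generic =
      pool.Get(paddle::platform::CPUPlace());         // type-erased handle
  const paddle::platform::CPUDeviceContext* typed =
      pool.GetByPlace(paddle::platform::CPUPlace());  // statically typed handle
  (void)generic;
  (void)typed;
}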
...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { ...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
DeviceContextPool& pool = DeviceContextPool::Get(); DeviceContextPool& pool = DeviceContextPool::Instance();
auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx1 = pool.Get(CPUPlace());
auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx2 = pool.Get(CPUPlace());
EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
std::vector<Place> gpu_places; std::vector<Place> gpu_places;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
gpu_places.emplace_back(CUDAPlace(i)); auto dev_ctx = pool.Get(CUDAPlace(i));
} ASSERT_NE(dev_ctx, nullptr);
auto dev_ctxs = pool.Borrow(gpu_places);
for (size_t i = 0; i < dev_ctxs.size(); ++i) {
auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
// check same as CUDAPlace(i)
CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
} }
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
"device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places; std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace()); places.emplace_back(paddle::platform::CPUPlace());
...@@ -109,7 +94,7 @@ int main(int argc, char** argv) { ...@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> { ...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
constexpr size_t num_threads = 1024; constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads; int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads;
......
...@@ -144,7 +144,7 @@ int main(int argc, char** argv) { ...@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h" #include "paddle/platform/variant.h"
namespace paddle { namespace paddle {
...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &); ...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
template <typename Visitor>
struct PlaceVisitorWrapper
: public boost::static_visitor<typename Visitor::result_type> {
const Visitor &visitor_;
explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
typename Visitor::result_type operator()(const CPUPlace &cpu) const {
return visitor_(cpu);
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
#else
PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
return typename Visitor::result_type();
#endif
}
};
template <typename Visitor>
typename Visitor::result_type VisitPlace(const Place &place,
const Visitor &visitor) {
return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -3,6 +3,7 @@ if(WITH_PYTHON) ...@@ -3,6 +3,7 @@ if(WITH_PYTHON)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init DEPS pybind python backward proto_desc paddle_memory executor prune init
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
target_link_libraries(paddle_pybind rt)
endif(WITH_PYTHON) endif(WITH_PYTHON)
if(WITH_DOC) if(WITH_DOC)
......
...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) { ...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVar(name); return self.HasVar(name);
}) })
.def("has_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.HasVarRecursive(name);
})
.def("find_var", .def("find_var",
[](BlockDesc &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVar(name); return self.FindVar(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("find_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.FindVarRecursive(name);
},
py::return_value_policy::reference)
.def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
.def("op_size", &BlockDesc::OpSize) .def("op_size", &BlockDesc::OpSize)
.def("op", &BlockDesc::Op, py::return_value_policy::reference) .def("op", &BlockDesc::Op, py::return_value_policy::reference)
...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) { ...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
.def("set_shape", &VarDesc::SetShape) .def("set_shape", &VarDesc::SetShape)
.def("set_dtype", &VarDesc::SetDataType) .def("set_dtype", &VarDesc::SetDataType)
.def("shape", &VarDesc::Shape, py::return_value_policy::reference) .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLodLevel) .def("lod_level", &VarDesc::GetLodLevel)
.def("set_lod_level", &VarDesc::SetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel)
.def("type", &VarDesc::GetType) .def("type", &VarDesc::GetType)
...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) { ...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
.value("BLOCK", proto::AttrType::BLOCK); .value("BLOCK", proto::AttrType::BLOCK);
py::class_<OpDesc> op_desc(m, "OpDesc", ""); py::class_<OpDesc> op_desc(m, "OpDesc", "");
op_desc.def("type", &OpDesc::Type) op_desc
.def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
py::return_value_policy::reference)
.def("copy_from", &OpDesc::CopyFrom)
.def("type", &OpDesc::Type)
.def("set_type", &OpDesc::SetType) .def("set_type", &OpDesc::SetType)
.def("input", &OpDesc::Input) .def("input", &OpDesc::Input)
.def("input_names", &OpDesc::InputNames) .def("input_names", &OpDesc::InputNames)
.def("set_input", &OpDesc::SetInput)
.def("output", &OpDesc::Output) .def("output", &OpDesc::Output)
.def("output_names", &OpDesc::OutputNames) .def("output_names", &OpDesc::OutputNames)
.def("set_input", &OpDesc::SetInput)
.def("set_output", &OpDesc::SetOutput) .def("set_output", &OpDesc::SetOutput)
.def("input_arg_names", &OpDesc::InputArgumentNames)
.def("output_arg_names", &OpDesc::OutputArgumentNames)
.def("rename_input", &OpDesc::RenameInput)
.def("rename_output", &OpDesc::RenameOutput)
.def("has_attr", &OpDesc::HasAttr) .def("has_attr", &OpDesc::HasAttr)
.def("attr_type", &OpDesc::GetAttrType) .def("attr_type", &OpDesc::GetAttrType)
.def("attr_names", &OpDesc::AttrNames) .def("attr_names", &OpDesc::AttrNames)
......
...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle.
} }
return ret_values; return ret_values;
}); });
m.def("get_grad_op_descs", m.def(
[](const OpDesc &op_desc, "get_grad_op_desc", [](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set, const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> &grad_to_var, const std::vector<BlockDesc *> &grad_sub_block) {
const std::vector<BlockDesc *> &grad_sub_block) { std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<OpDesc>> grad_op_descs = std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance() framework::OpInfoMap::Instance()
.Get(op_desc.Type()) .Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var, .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block); grad_sub_block);
std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size()); std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform( std::transform(grad_op_descs.begin(), grad_op_descs.end(),
grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(),
grad_op_desc_ptrs.begin(), [](std::unique_ptr<OpDesc> &p) { return p.release(); });
[](std::unique_ptr<OpDesc> &p) { return p.release(); }); return std::make_pair(grad_op_desc_ptrs, grad_to_var);
return grad_op_desc_ptrs; });
});
m.def("prune", [](const ProgramDesc &origin, m.def("prune", [](const ProgramDesc &origin,
const std::vector<std::array<size_t, 2>> &targets) { const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin); ProgramDesc prog_with_targets(origin);
...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc); InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc); return new ProgramDesc(pruned_desc);
}); });
m.def("empty_var_name", []() { return framework::kEmptyVarName; });
m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
m.def_submodule( m.def_submodule(
"var_names", "var_names",
"The module will return special predefined variable name in Paddle") "The module will return special predefined variable name in Paddle")
......
...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>( auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
pool.Borrow(tensor.place())); pool.Get(tensor.place()));
paddle::platform::GpuMemcpyAsync( paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( ...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
} }
......
...@@ -178,7 +178,7 @@ EOF ...@@ -178,7 +178,7 @@ EOF
# run paddle version to install python packages first # run paddle version to install python packages first
RUN apt-get update &&\ RUN apt-get update &&\
${NCCL_DEPS}\ ${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode && pip install -U pip && \ apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
pip install /*.whl; apt-get install -f -y && \ pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \ apt-get clean -y && \
rm -f /*.whl && \ rm -f /*.whl && \
......
...@@ -71,9 +71,7 @@ function threads_config() { ...@@ -71,9 +71,7 @@ function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors # according to trainer_count and total processors
# only when MKL enabled # only when MKL enabled
if [ "@WITH_MKL@" == "OFF" ]; then # auto set OPENBLAS_NUM_THREADS when do not use MKL
return 0
fi
processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
if [ -z $trainers ]; then if [ -z $trainers ]; then
...@@ -83,12 +81,19 @@ function threads_config() { ...@@ -83,12 +81,19 @@ function threads_config() {
if [ $threads -eq 0 ]; then if [ $threads -eq 0 ]; then
threads=1 threads=1
fi fi
if [ -z "$OMP_NUM_THREADS" ]; then if [ "@WITH_MKL@" == "ON" ]; then
export OMP_NUM_THREADS=$threads if [ -z "$OMP_NUM_THREADS" ]; then
fi export OMP_NUM_THREADS=$threads
if [ -z "$MKL_NUM_THREADS" ]; then fi
export MKL_NUM_THREADS=$threads if [ -z "$MKL_NUM_THREADS" ]; then
export MKL_NUM_THREADS=$threads
fi
else
if [ -z "$OPENBLAS_NUM_THREADS" ]; then
export OPENBLAS_NUM_THREADS=$threads
fi
fi fi
} }
PADDLE_CONF_HOME="$HOME/.config/paddle" PADDLE_CONF_HOME="$HOME/.config/paddle"
...@@ -150,7 +155,7 @@ fi ...@@ -150,7 +155,7 @@ fi
case "$1" in case "$1" in
"train") "train")
threads_config $@ threads_config $@
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
;; ;;
"merge_model") "merge_model")
......
...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid'] ...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In official 'readme', tstid is the flag of test data # In official 'readme', tstid is the flag of test data
......
...@@ -36,7 +36,7 @@ def __read_gflags_from_env__(): ...@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
""" """
import sys import sys
import core import core
read_env_flags = ['use_pinned_memory'] read_env_flags = ['use_pinned_memory', 'check_nan_inf']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
from paddle.v2.fluid import framework as framework from paddle.v2.fluid import framework as framework
from . import core
import collections
__all__ = ['append_backward_ops'] __all__ = ['append_backward']
def append_backward_ops(loss, parameter_list=None, no_grad_set=None): def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None,
end_idx=None):
if begin_idx is None:
begin_idx = 0
if end_idx is None:
end_idx = len(op_desc_list)
for i in range(begin_idx, end_idx):
op_desc = op_desc_list[i]
if isinstance(op_desc, tuple):
op_desc = op_desc[0]
op_desc.rename_input(old_name, new_name)
op_desc.rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs):
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in inputs.iteritems():
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
for name, val in attrs.iteritems():
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
op_desc.set_attr(name, val)
return op_desc
def _infer_var_data_type_(var_name, block):
grad_var = block.desc.find_var(var_name.encode("ascii"))
fwd_name = _strip_grad_suffix_(var_name.encode("ascii"))
if block.desc.has_var_recursive(fwd_name):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
grad_var.set_dtype(fwd_var.dtype())
else:
grad_var.set_dtype(core.DataType.FP32)
def _all_in_set_(cands, s):
for c in cands:
if not c in s:
return False
return True
def _strip_grad_suffix_(name):
pos = name.find(core.grad_var_suffix())
return name[:pos] if pos != -1 else name
def _append_grad_suffix_(name):
return name + core.grad_var_suffix()
def _addup_repetitive_outputs_(op_descs):
# In the backward part, a variable may be the output of more than one op.
# In this case, the variable should be the accumulation of all the outputs.
# We adopt adding `sum_op`s to implement the accumulation.
pending_sum_ops = []
var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list)
for idx, op_desc in enumerate(op_descs):
for var_name in op_desc.input_arg_names():
if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": renamed_vars[var_name]},
{"Out": [var_name]}, {}), idx))
renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names():
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
# empty variable or inplace op
continue
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1:
pending_sum_ops.append((_create_op_desc_(
"sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
# sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops):
op_descs.insert(p[1], p[0])
return op_descs
def _remove_no_grad_branch_(op_descs, no_grad_set):
# Remove ops whose outputs are all in no_grad_dict
op_descs = filter(
lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
op_descs)
# Insert fill_zeros_like_op
to_insert = []
for idx, op_desc in enumerate(op_descs):
for arg in op_desc.input_arg_names():
if core.grad_var_suffix() in arg and arg in no_grad_set:
to_insert.append((_create_op_desc_("fill_zeros_like", {
"X": [_strip_grad_suffix_(arg)]
}, {"Y": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
return op_descs
def _append_backward_ops_(target,
block,
target_block,
no_grad_dict,
grad_to_var,
callback=None):
grad_op_descs = []
program = block.program
for op in reversed(block.ops):
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
grad_sub_block = program.create_block(parent_idx=sub_block.idx)
_append_backward_ops_(target, sub_block, grad_sub_block,
no_grad_dict, grad_to_var, callback)
grad_sub_block_list.append(grad_sub_block.desc)
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
no_grad_dict[block.idx])
if target_block.idx == 0:
grad_op_descs.insert(
0,
_create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(target.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": target.dtype}))
# append op_desc in grad_op_descs to target_block
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc)
def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set()
# create new gradient variables
for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii")
if block.desc.has_var_recursive(
grad_var_name) or grad_var_name == core.empty_var_name():
continue
block.desc.var(grad_var_name)
new_vars.add(grad_var_name)
if not grad_to_var.has_key(grad_var_name):
continue
grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
# infer_shape and infer_type
op_desc.infer_var_type(block.desc)
op_desc.infer_shape(block.desc)
for arg in op_desc.output_arg_names():
if arg in new_vars:
_infer_var_data_type_(arg, block)
def append_backward(loss, parameter_list=None, no_grad_set=None):
""" """
Create and add gradient Operators in BlockDesc to compute Create and add gradient Operators in BlockDesc to compute
gradients of `loss` for parameters in parameter_list gradients of `loss` for parameters in parameter_list
:param loss: a variable generated by the cost function. :param loss: a variable generated by the cost function.
:type loss: Variable :type loss: Variable
:param no_grad_set: variable that should not create gradient :param no_grad_dict: variable that should not create gradient
:type no_grad_set: set :type no_grad_dict: set
:param parameter_list: parameters that need to compute gradient and :param parameter_list: parameters that need to compute gradient and
update to optimize the loss. update to optimize the loss.
:type: list :type: list
...@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): ...@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
""" """
assert isinstance(loss, framework.Variable) assert isinstance(loss, framework.Variable)
program = loss.block.program
no_grad_dict = dict()
if no_grad_set is None: if no_grad_set is None:
program = loss.block.program
assert isinstance(program, framework.Program) assert isinstance(program, framework.Program)
no_grad_set = list()
for block in program.blocks: for block in program.blocks:
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues(): for var in block.vars.itervalues():
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
if var.stop_gradient: if var.stop_gradient:
no_grad_set.append(var.name) block_no_grad_set.add(_append_grad_suffix_(var.name))
no_grad_set = set(no_grad_set) no_grad_dict[block.idx] = block_no_grad_set
elif isinstance(no_grad_set, set):
no_grad_dict = {0: no_grad_set}
else:
raise ValueError("'no_grad_set' should be a set or None.")
grad_info_map = dict()
root_block = program.block(0)
fwd_op_num = root_block.desc.op_size()
current_block_idx = program.current_block_idx
grad_to_var = dict()
_append_backward_ops_(loss, root_block, root_block, no_grad_dict,
grad_to_var)
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx
program.sync_with_cpp()
param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
if parameter_list is not None: if parameter_list is not None:
parameters = parameter_list parameters = parameter_list
else: else:
params = loss.block.program.global_block().all_parameters() params = program.global_block().all_parameters()
parameters = [param.name for param in params] parameters = [param.name for param in params]
params_and_grads = [] params_and_grads = []
for param in parameters: for param in parameters:
if param not in param_grad_map: if param not in grad_info_map:
raise ValueError("param %s is not in map" % param) raise ValueError("param %s is not in map" % param)
grad_info = param_grad_map[param] grad_info = grad_info_map[param]
grad_block = loss.block.program.block(grad_info[1]) grad_block = grad_info[1]
if not grad_block.has_var(grad_info[0]): if not grad_block.has_var(grad_info[0]):
raise ValueError("grad block[{0}] did not have grad var {1}".format( raise ValueError("grad block[{0}] did not have grad var {1}".format(
grad_info[1], grad_info[0])) grad_info[1], grad_info[0]))
# Get the param var from the global block # Get the param var from the global block
param_var = loss.block.program.global_block().var(param) param_var = program.global_block().var(param)
grad_var = grad_block.var(grad_info[0]) grad_var = grad_block.var(grad_info[0])
if loss.block.has_var(grad_info[0]): if loss.block.has_var(grad_info[0]):
params_and_grads.append((param_var, grad_var)) params_and_grads.append((param_var, grad_var))
......
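As a minimal usage sketch of the renamed entry point (assuming the paddle.v2.fluid API shown in this diff; the small network below is hypothetical), `append_backward` is called on a loss variable and returns (parameter, gradient) pairs, just as `append_backward_ops` did before:

.. code-block:: python

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.backward import append_backward

    # hypothetical two-layer classifier, only used to obtain a loss variable
    image = layers.data(name='image', shape=[784], dtype='float32')
    label = layers.data(name='label', shape=[1], dtype='int64')
    predict = layers.fc(input=image, size=10, act='softmax')
    cost = layers.cross_entropy(input=predict, label=label)
    avg_cost = layers.mean(x=cost)

    # appends the gradient ops for every trainable parameter reachable
    # from avg_cost and returns a list of (parameter, gradient) pairs
    params_grads = append_backward(loss=avg_cost)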
...@@ -3,7 +3,7 @@ import core ...@@ -3,7 +3,7 @@ import core
import numpy import numpy
import six.moves as six import six.moves as six
from framework import Variable from framework import Variable, default_main_program
__all__ = ['DataFeeder'] __all__ = ['DataFeeder']
...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object): ...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object): class DataFeeder(object):
def __init__(self, feed_list, place): def __init__(self, feed_list, place, program=None):
self.feed_dtypes = [] self.feed_dtypes = []
self.feed_names = [] self.feed_names = []
self.feed_shapes = [] self.feed_shapes = []
self.feed_lod_level = [] self.feed_lod_level = []
if program is None:
program = default_main_program()
for each_var in feed_list: for each_var in feed_list:
if isinstance(each_var, basestring):
each_var = program.block(0).var(each_var)
if not isinstance(each_var, Variable): if not isinstance(each_var, Variable):
raise TypeError("Feed list should contain a list of variable") raise TypeError("Feed list should contain a list of variable")
self.feed_dtypes.append(each_var.dtype) self.feed_dtypes.append(each_var.dtype)
......
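With the new `program` argument and the name lookup above, `DataFeeder` should now accept plain variable names in `feed_list` and resolve them in block 0 of the given (or default main) program. A hedged sketch, reusing the hypothetical `image`/`label` variables created in the default main program:

.. code-block:: python

    import paddle.v2.fluid as fluid

    place = fluid.CPUPlace()
    # strings are looked up via program.block(0).var(name); Variable
    # objects are still accepted as before
    feeder = fluid.DataFeeder(feed_list=['image', 'label'], place=place)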
...@@ -846,9 +846,11 @@ class Program(object): ...@@ -846,9 +846,11 @@ class Program(object):
self.sync_with_cpp() self.sync_with_cpp()
return param_to_grad_info return param_to_grad_info
def create_block(self): def create_block(self, parent_idx=None):
new_block_idx = len(self.blocks) new_block_idx = len(self.blocks)
self.desc.append_block(self.current_block().desc) parent = self.current_block() if parent_idx is None else self.block(
parent_idx)
self.desc.append_block(parent.desc)
self.current_block_idx = new_block_idx self.current_block_idx = new_block_idx
self.blocks.append(Block(self, self.current_block_idx)) self.blocks.append(Block(self, self.current_block_idx))
return self.current_block() return self.current_block()
......
...@@ -188,7 +188,7 @@ def save_inference_model(dirname, ...@@ -188,7 +188,7 @@ def save_inference_model(dirname,
raise ValueError("'feed_var_names' should be a list of str.") raise ValueError("'feed_var_names' should be a list of str.")
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
feeded_var_names = [feeded_var_names] target_vars = [target_vars]
else: else:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
......
...@@ -270,6 +270,7 @@ def gru_unit(input, ...@@ -270,6 +270,7 @@ def gru_unit(input,
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
# create bias # create bias
if bias is None: if bias is None:
bias_size = [1, 3 * size] bias_size = [1, 3 * size]
bias = helper.create_parameter( bias = helper.create_parameter(
...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs): ...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
def cross_entropy(input, label, **kwargs): def cross_entropy(input, label, **kwargs):
""" """
This function computes cross_entropy using the input and label. **Cross Entropy Layer**
This layer computes the cross entropy between `input` and `label`. It supports
both standard cross-entropy and soft-label cross-entropy loss computation.
1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
.. math::
Y[i] = -\log(X[i, Label[i]])
2) Soft-label cross-entropy:
`soft_label = True`, `Label[i, j]` indicates the soft label of class j
for sample i:
.. math::
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summation of each row of `label`
equals one.
3) One-hot cross-entropy with vectorized `label`:
As a special case of 2), when each row of 'label' has only one
non-zero element which is equal to 1, soft-label cross-entropy degenerates
to a one-hot cross-entropy with one-hot label representation.
Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability
computed by the previous operator, which is almost always the result
of a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape
[N x 1]. When `soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
the given labels as soft labels, default `False`.
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
`soft_label == True`, and the 2nd dimension of `input` and `label` are not \
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
Examples:
.. code-block:: python
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
helper = LayerHelper('cross_entropy', **kwargs) helper = LayerHelper('cross_entropy', **kwargs)
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
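The two formulas in the docstring can be checked by hand; the numpy computation below (all values made up, purely illustrative) reproduces the one-hot and soft-label cases:

.. code-block:: python

    import numpy as np

    X = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])            # probabilities, shape [N=2, D=3]
    hard_label = np.array([0, 1])              # class index per sample
    soft_label = np.array([[0.9, 0.05, 0.05],
                           [0.1, 0.8, 0.1]])   # each row sums to one

    # 1) one-hot:    Y[i] = -log(X[i, Label[i]])
    y_hard = -np.log(X[np.arange(2), hard_label])        # approx. [0.357, 0.223]
    # 2) soft-label: Y[i] = -sum_j Label[i, j] * log(X[i, j])
    y_soft = -(soft_label * np.log(X)).sum(axis=1)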
...@@ -514,14 +567,83 @@ def conv2d(input, ...@@ -514,14 +567,83 @@ def conv2d(input,
groups=None, groups=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
act=None, act=None):
name=None):
""" """
This function creates the op for a 2-dimensional Convolution. **Convolution2D Layer**
This is performed using the parameters of filters(size, dimensionality etc)
, stride and other configurations for a Convolution operation. The convolution2D layer calculates the output based on the input, filter
This function can also append an activation on top of the and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output)
conv-2d output, if mentioned in the input parameters. are in NCHW format, where N is batch size, C is the number of channels, H is the height
of the feature, and W is the width of the feature.
For details of the convolution layer, please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
If bias attribution and activation type are provided, bias is added to the output of the convolution,
and the corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filters. It is the same as the number of
output image channels.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
groups(int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
act(str): Activation type. Default: None
Returns:
Variable: The tensor variable storing the convolution and \
non-linearity activation result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
""" """
if stride is None: if stride is None:
......
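The H_out/W_out formula in the conv2d docstring can be sanity-checked with a few lines of plain Python; for the docstring's own example (3x32x32 input, 3x3 filter, default stride 1, padding 0, dilation 1) it predicts a 30x30 output:

.. code-block:: python

    def conv_out_size(in_size, filter_size, padding=0, stride=1, dilation=1):
        # H_out = (H_in + 2*padding - (dilation*(H_f - 1) + 1)) / stride + 1
        return (in_size + 2 * padding - (dilation * (filter_size - 1) + 1)) // stride + 1

    print(conv_out_size(32, 3))   # 30 -> output shape [N, 2, 30, 30]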
from collections import defaultdict from collections import defaultdict
import framework import framework
from backward import append_backward_ops from backward import append_backward
from framework import unique_name, program_guard from framework import unique_name, program_guard
from initializer import Constant from initializer import Constant
from layer_helper import LayerHelper from layer_helper import LayerHelper
...@@ -194,10 +194,10 @@ class Optimizer(object): ...@@ -194,10 +194,10 @@ class Optimizer(object):
no_grad_set=None): no_grad_set=None):
"""Add operations to minimize `loss` by updating `parameter_list`. """Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward_ops()` and This method combines interface `append_backward()` and
`create_optimization_pass()` into one. `create_optimization_pass()` into one.
""" """
params_grads = append_backward_ops(loss, parameter_list, no_grad_set) params_grads = append_backward(loss, parameter_list, no_grad_set)
params_grads = append_gradient_clip_ops(params_grads) params_grads = append_gradient_clip_ops(params_grads)
......
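A hedged sketch of the combined interface described in the docstring, assuming an `SGDOptimizer` class is available in paddle.v2.fluid.optimizer and that `avg_cost` is a loss variable like the one built above:

.. code-block:: python

    import paddle.v2.fluid.optimizer as optimizer

    sgd = optimizer.SGDOptimizer(learning_rate=0.01)
    # minimize() now calls append_backward() internally and then runs
    # create_optimization_pass() on the resulting (param, grad) pairs
    sgd.minimize(avg_cost)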
...@@ -4,7 +4,7 @@ import random ...@@ -4,7 +4,7 @@ import random
import itertools import itertools
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import collections import collections
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import Program, OpProtoHolder from paddle.v2.fluid.framework import Program, OpProtoHolder
...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase): ...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc) op_loss.desc.infer_shape(block.desc)
param_grad_list = append_backward_ops( param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
feed_dict = { feed_dict = {
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase): ...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
total_sum = layers.sums(input=[a_sum, x_sum]) total_sum = layers.sums(input=[a_sum, x_sum])
total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
append_backward_ops(total_sum_scaled) append_backward(total_sum_scaled)
g_vars = map(default_main_program().global_block().var, g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]) [each_x.name + "@GRAD" for each_x in x])
......
...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers ...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase): ...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
outs = exe.run(feed={'X': x}, fetch_list=[out])[0] outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
print outs print outs
loss = layers.mean(x=out) loss = layers.mean(x=out)
append_backward_ops(loss=loss) append_backward(loss=loss)
outs = exe.run( outs = exe.run(
feed={'X': x}, feed={'X': x},
fetch_list=[ fetch_list=[
......
...@@ -4,7 +4,7 @@ import numpy ...@@ -4,7 +4,7 @@ import numpy
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): ...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
mean = layers.mean(x=result) mean = layers.mean(x=result)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestOptimizer(unittest.TestCase): class TestOptimizer(unittest.TestCase):
...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
momentum_optimizer = self.MockMomentum( momentum_optimizer = self.MockMomentum(
learning_rate=learning_rate, momentum=0.2, use_nesterov=True) learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase): ...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adagrad_optimizer = self.MockAdagrad( adagrad_optimizer = self.MockAdagrad(
learning_rate=learning_rate, epsilon=1.0e-6) learning_rate=learning_rate, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase): ...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adam_optimizer = self.MockAdam( adam_optimizer = self.MockAdam(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adam_optimizer.get_accumulators()), 0) self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase): ...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adamax_optimizer = self.MockAdamax( adamax_optimizer = self.MockAdamax(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): ...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
decayed_adagrad_optimizer = self.MockDecayedAdagrad( decayed_adagrad_optimizer = self.MockDecayedAdagrad(
learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
opts = decayed_adagrad_optimizer.create_optimization_pass( opts = decayed_adagrad_optimizer.create_optimization_pass(
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, grad_var_name from paddle.v2.fluid.framework import Program, grad_var_name
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase): ...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
def test_backward(self): def test_backward(self):
self.check_forward() self.check_forward()
append_backward_ops(self.output) append_backward(self.output)
ana_grad = [np.array(x) for x in self.backward()] ana_grad = [np.array(x) for x in self.backward()]
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.regularizer as regularizer import paddle.v2.fluid.regularizer as regularizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestL2DecayRegularizer(unittest.TestCase): class TestL2DecayRegularizer(unittest.TestCase):
...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase): ...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase): ...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
......
...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
new_dat = fluid.layers.reorder_lod_tensor_by_rank( new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table) x=dat, rank_table=table)
loss = fluid.layers.mean(x=new_dat) loss = fluid.layers.mean(x=new_dat)
fluid.backward.append_backward_ops(loss=loss) fluid.backward.append_backward(loss=loss)
cpu = fluid.CPUPlace() cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu) exe = fluid.Executor(cpu)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase): ...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
mem3_mean = layers.mean(x=mem3) mem3_mean = layers.mean(x=mem3)
append_backward_ops(loss=mem3_mean) append_backward(loss=mem3_mean)
x_grad = exe.run( x_grad = exe.run(
feed={'x': tensor}, feed={'x': tensor},
fetch_list=[main_program.global_block().var('x@GRAD')])[0] fetch_list=[main_program.global_block().var('x@GRAD')])[0]
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): ...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
mean = layers.mean(x=out) mean = layers.mean(x=out)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase): ...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
sum_result = layers.array_read(array=mem_array, i=i) sum_result = layers.array_read(array=mem_array, i=i)
loss = layers.mean(x=sum_result) loss = layers.mean(x=sum_result)
append_backward_ops(loss) append_backward(loss)
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
......