Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cross_channel_norm

77b0bf41 · sweetsky0901 · d18d7aa9 · ea5d6eae · 77b0bf41 · 77b0bf41
151 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 include(system)
@@ -201,6 +199,10 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
  add_subdirectory(python)

--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -291,10 +291,10 @@ public:
  }
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
    for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
    }
  }

--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de
 ### Place and DeviceContext
+Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
 ```
-        |   CPUPlace   --> MKLDNNPlace
+        |   CPUPlace
-Place --|   CUDAPlace  --> CUDNNPlace
+Place --|   CUDAPlace
        |   FPGAPlace
 ```
@@ -43,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
 #### DeviceContext
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
 ```
@@ -106,7 +107,7 @@ template <typename Place>
 size_t Used(Place place);
 ```
-To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
 #### Tensor
@@ -243,6 +244,7 @@ REGISTER_OP_CUDA_KERNEL(
 Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
-We will discuss how to implement an efficient OpKernel switch policy. 
+For more details, please refer to following docs:
- TBD
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -59,5 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
-cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_test(threadpool_test SRCS threadpool_test.cc)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
 cc_test(init_test SRCS init_test.cc DEPS init)
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context)
--- a/paddle/framework/data_layout.h
+++ b/paddle/framework/data_layout.h
@@ -14,6 +14,9 @@ limitations under the License. */
 #pragma once
+#include <iostream>
+#include "paddle/platform/enforce.h"
 namespace paddle {
 namespace framework {
@@ -33,5 +36,23 @@ inline DataLayout StringToDataLayout(const std::string& str) {
  }
 }
+inline std::string DataLayoutToString(const DataLayout& data_layout) {
+  switch (data_layout) {
+    case kNHWC:
+      return "NHWC";
+    case kNCHW:
+      return "NCHW";
+    case kAnyLayout:
+      return "ANY_LAYOUT";
+    default:
+      PADDLE_THROW("unknown DataLayou %d", data_layout);
+  }
+}
+inline std::ostream& operator<<(std::ostream& out, DataLayout l) {
+  out << DataLayoutToString(l);
+  return out;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,13 +33,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
-DeviceContextPool* DeviceContextPool::pool = nullptr;
+Executor::Executor(const platform::Place& place) : place_(place) {}
-Executor::Executor(const std::vector<platform::Place>& places) {
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto borrowed_contexts = pool.Borrow(places);
-  device_contexts_.swap(borrowed_contexts);
-}
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  if (var_type == proto::VarDesc::LOD_TENSOR) {
@@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  //    - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
  auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];
  Scope* local_scope = scope;
  if (create_vars) {
@@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
  }
  if (create_local_scope) {
    scope->DeleteScope(local_scope);

--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -14,9 +14,6 @@ limitations under the License. */
 #pragma once
-#include <map>
-#include <unordered_map>
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
@@ -26,96 +23,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class DeviceContextPool {
- public:
-  static DeviceContextPool& Get() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
-    return *pool;
-  }
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
-    if (pool == nullptr) {
-      pool = new DeviceContextPool(places);
-    }
-    return *pool;
-  }
-  const platform::DeviceContext* Borrow(const platform::Place& place) {
-    auto range = device_contexts_.equal_range(place);
-    if (range.first == range.second) {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-    return range.first->second;
-  }
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-    std::vector<const platform::DeviceContext*> borrowed_contexts;
-    for (auto& place : places) {
-      auto range = device_contexts_.equal_range(place);
-      if (range.first == range.second) {
-        PADDLE_THROW(
-            "'Place' is not supported, Please re-compile with WITH_GPU "
-            "option");
-      }
-      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
-      // device load balancer maybe useful here.
-      borrowed_contexts.emplace_back(range.first->second);
-    }
-    return borrowed_contexts;
-  }
-  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    for (size_t i = 0; i < places.size(); i++) {
-      if (platform::is_cpu_place(places[i])) {
-        device_contexts_.emplace(
-            places[i], new platform::CPUDeviceContext(
-                           boost::get<platform::CPUPlace>(places[i])));
-      } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-        device_contexts_.emplace(
-            places[i], new platform::CUDADeviceContext(
-                           boost::get<platform::GPUPlace>(places[i])));
-#else
-        PADDLE_THROW(
-            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-            "option");
-#endif
-      }
-    }
-  }
-  ~DeviceContextPool() {}
- private:
-  static DeviceContextPool* pool;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      return hash_(place.which());
-    }
-  };
-  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
-                          Hash>
-      device_contexts_;
-  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
-};
 class Executor {
 public:
  // TODO(dzhwinter) : Do not rely on this function, it will be removed
  explicit Executor(const platform::DeviceContext& device)
-      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
+      : Executor(device.GetPlace()) {}
-  explicit Executor(const platform::Place& place)
-      : Executor(std::vector<platform::Place>({place})) {}
-  explicit Executor(const std::vector<platform::Place>& places);
+  explicit Executor(const platform::Place& place);
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -128,7 +42,7 @@ class Executor {
           bool create_vars = true);
 private:
-  std::vector<const platform::DeviceContext*> device_contexts_;
+  const platform::Place place_;
 };
 }  // namespace framework

--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -14,8 +14,8 @@
 #include <algorithm>
 #include <string>
-#include "paddle/framework/executor.h"
 #include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
 #include "paddle/string/piece.h"
@@ -48,13 +48,13 @@ bool InitDevices(const std::vector<std::string> &devices) {
  std::vector<platform::Place> places;
  for (auto &device : devices) {
    auto p = string::Piece(device);
-    if (string::Find(p, ':', 0) == string::Piece::npos) {
+    if (string::HasPrefix(p, "CPU")) {
      places.emplace_back(platform::CPUPlace());
    } else if (string::HasPrefix(p, "GPU")) {
 #ifdef PADDLE_WITH_CUDA
      auto pos = string::RFind(p, ':', string::Piece::npos);
      auto number = device.substr(pos + 1);
-      places.emplace_back(platform::GPUPlace(std::stoi(number)));
+      places.emplace_back(platform::CUDAPlace(std::stoi(number)));
 #else
      LOG(WARNING)
          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
@@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
                     return platform::is_cpu_place(place);
                   }) == places.end()) {
    places.emplace_back(platform::CPUPlace());
-    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Create(places);
-  return true;
  return true;
 }

--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
 #ifdef PADDLE_WITH_CUDA
  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
  ASSERT_EQ(InitDevices(ds2), true);
+  // test re-init
+  std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds3), true);
 #endif
 }
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
@@ -20,7 +20,25 @@ namespace framework {
 // For more details about the design of LibraryType, Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
-enum LibraryType { kPlain = 0; kMKLDNN = 1; kCUDNN = 2; }
+enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
+inline std::string LibraryTypeToString(const LibraryType& library_type) {
+  switch (library_type) {
+    case kPlain:
+      return "PLAIN";
+    case kMKLDNN:
+      return "MKLDNN";
+    case kCUDNN:
+      return "CUDNN";
+    default:
+      PADDLE_THROW("unknown LibraryType %d", library_type);
+  }
+}
+inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
+  out << LibraryTypeToString(l);
+  return out;
+}
 }  // namespace
 }  // framework
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::GPUPlace>(tensor.place()),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
                     reinterpret_cast<const void *>(data), size_to_write,
                     gpu_dev_ctx.stream());
        gpu_dev_ctx.Wait();

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {
 TEST(LoDTensor, LoDInGPU) {
  paddle::framework::LoDTensor lod_tensor;
-  paddle::platform::GPUPlace place(0);
+  paddle::platform::CUDAPlace place(0);
  paddle::framework::LoD src_lod;
  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});

--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/data_layout.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/library_type.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+namespace paddle {
+namespace framework {
+struct OpKernelType {
+  struct Hash {
+    size_t operator()(const OpKernelType& key) const {
+      int place = key.place_.which() + (1 << LEFT_SHIFT);
+      int data_type =
+          static_cast<int>(key.data_type_) + (1 << (LEFT_SHIFT + 1));
+      int data_layout =
+          static_cast<int>(key.data_layout_) + (1 << (LEFT_SHIFT + 2));
+      int library_type =
+          static_cast<int>(key.library_type_) + (1 << (LEFT_SHIFT + 3));
+      std::hash<int> hasher;
+      return hasher(place + data_type + data_layout + library_type);
+    }
+  };
+  // place, data_type, library_type kinds less than 2^8
+  constexpr static int LEFT_SHIFT = 8;
+  proto::DataType data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+  OpKernelType(proto::DataType data_type, platform::Place place,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(place),
+        library_type_(library_type) {}
+  OpKernelType(proto::DataType data_type,
+               const platform::DeviceContext& dev_ctx,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(dev_ctx.GetPlace()),
+        library_type_(library_type) {}
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
+           library_type_ == o.library_type_;
+  }
+};
+inline std::ostream& operator<<(std::ostream& os,
+                                const OpKernelType& kernel_key) {
+  os << "data_type[" << kernel_key.data_type_ << "]:data_layout["
+     << kernel_key.data_layout_ << "]:place[" << kernel_key.place_
+     << "]:library_type[" << kernel_key.library_type_ << "]";
+  return os;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_kernel_type.h"
+#include <gtest/gtest.h>
+#include <iostream>
+TEST(OpKernelType, ToString) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                              LibraryType::kCUDNN);
+  std::ostringstream stream;
+  stream << op_kernel_type;
+  ASSERT_EQ(
+      stream.str(),
+      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
+}
+TEST(OpKernelType, Hash) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+  OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+  OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+  OpKernelType::Hash hasher;
+  ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
+}
\ No newline at end of file
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -61,17 +61,6 @@ struct OperatorRegistrar : public Registrar {
 class OpRegistry {
 public:
-  template <typename OpType, typename ProtoMakerType, typename GradOpType>
-  static void RegisterOp(const std::string& op_type,
-                         const std::string& grad_op_type) {
-    OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
-    reg.info.grad_op_type_ = grad_op_type;
-    // register gradient op
-    if (!grad_op_type.empty()) {
-      OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
-    }
-  }
  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
                                                const VariableNameMap& inputs,
                                                const VariableNameMap& outputs,
@@ -199,7 +188,7 @@ class OpKernelRegistrar : public Registrar {
  }
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
+  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -8,8 +8,7 @@ namespace framework {
 class CosineOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const override {}
-           const platform::DeviceContext& dev_ctx) const override {}
 };
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const override {}
-           const platform::DeviceContext& dev_ctx) const override {}
 };
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) {
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
  float scale_get = op->Attr<float>("scale");
  ASSERT_EQ(scale_get, scale);
 }
@@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
  ASSERT_EQ(op->Attr<float>("scale"), 1.0);
 }
@@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(4);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
  int test_attr = op->Attr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);
 }

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/executor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
@@ -240,12 +242,6 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
  return res;
 }
-std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
-  os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
-     << "]";
-  return os;
-}
 bool OpSupportGPU(const std::string& op_type) {
  auto& all_kernels = OperatorWithKernel::AllOpKernels();
  auto it = all_kernels.find(op_type);
@@ -388,11 +384,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 };
 void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
+                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  ExecutionContext ctx(*this, scope, dev_ctx);
+  auto dev_ctx = pool.Borrow(place);
  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
@@ -404,19 +400,30 @@ void OperatorWithKernel::Run(const Scope& scope,
  // check if op[type] have kernel for kernel_key
  OpKernelMap& kernels = kernels_iter->second;
-  auto kernel_key = GetKernelType(ctx);
-  auto kernel_iter = kernels.find(kernel_key);
+  ExecutionContext ctx(*this, scope, *dev_ctx);
+  auto actual_kernel_key = GetActualKernelType(ctx);
+  auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
+  auto kernel_iter = kernels.find(expected_kernel_key);
  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("The operator %s does not support %s", type_, kernel_key);
+    PADDLE_THROW("The operator %s does not support %s", type_,
+                 expected_kernel_key);
  }
  kernel_iter->second->Compute(ctx);
 }
-OpKernelType OperatorWithKernel::GetKernelType(
+OpKernelType OperatorWithKernel::GetActualKernelType(
    const ExecutionContext& ctx) const {
  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
+OpKernelType OperatorWithKernel::GetExpectedKernelType(
+    const OpKernelType& actual_kernel_type) const {
+  return actual_kernel_type;
+}
 proto::DataType OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  auto& scope = ctx.scope();

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -23,15 +23,14 @@ limitations under the License. */
 #include "glog/logging.h"  // For VLOG
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/block_desc.h"
-#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_info.h"
+#include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
 #include "paddle/platform/variant.h"
 #include "paddle/utils/Error.h"
@@ -53,6 +52,11 @@ constexpr char kGradVarSuffix[] = "@GRAD";
 /// Variables with this suffix are supposed to be filled up with zeros.
 constexpr char kZeroVarSuffix[] = "@ZERO";
+// define some kernel hint
+const std::string kUseCPU = "use_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
 inline std::string GradVarName(const std::string& var_name) {
  return var_name + kGradVarSuffix;
 }
@@ -83,8 +87,7 @@ class OperatorBase {
  virtual std::string DebugString() const;
  /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope,
+  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
-                   const platform::DeviceContext& dev_ctx) const = 0;
  virtual bool IsNetOp() const { return false; }
@@ -159,8 +162,7 @@ class OperatorBase {
 class NOP : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const override {}
-           const platform::DeviceContext& dev_ctx) const override {}
  std::unique_ptr<OperatorBase> Clone() const override {
    return std::unique_ptr<OperatorBase>(new NOP(*this));
  }
@@ -345,34 +347,6 @@ class OpKernel : public OpKernelBase {
  using ELEMENT_TYPE = T;
 };
-struct OpKernelType {
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which();
-      int data_type = static_cast<int>(key.data_type_);
-      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
-                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
-      return hash_(pre_hash);
-    }
-  };
-  platform::Place place_;
-  proto::DataType data_type_;
-  OpKernelType(proto::DataType data_type, platform::Place place)
-      : place_(place), data_type_(data_type) {}
-  OpKernelType(proto::DataType data_type,
-               const platform::DeviceContext& dev_ctx)
-      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-  bool operator==(const OpKernelType& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           data_type_ == o.data_type_;
-  }
-};
 class OperatorWithKernel : public OperatorBase {
 public:
  using OpKernelMap =
@@ -383,8 +357,7 @@ class OperatorWithKernel : public OperatorBase {
                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const final;
-           const platform::DeviceContext& dev_ctx) const final;
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
  AllOpKernels() {
@@ -405,7 +378,9 @@ class OperatorWithKernel : public OperatorBase {
  }
 protected:
-  virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetExpectedKernelType(
+      const OpKernelType& actual_kernel_type) const;
 private:
  // indicate kernel DataType by input data. Defaultly all input data must be
@@ -413,8 +388,6 @@ class OperatorWithKernel : public OperatorBase {
  proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
 };
-std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
 extern bool OpSupportGPU(const std::string& op_type);
 }  // namespace framework

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/framework/operator.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
@@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase {
  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const override {
-           const platform::DeviceContext& dev_ctx) const override {
    ++op_run_num;
    ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
    ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase {
  int x{0};
 };
-class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
 public:
-  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-                                           OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
@@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name,
  }
 }
-REGISTER_OP_WITHOUT_GRADIENT(
+REGISTER_OP_WITHOUT_GRADIENT(test_operator,
-    test_operator, paddle::framework::OpWithoutKernelTest,
+                             paddle::framework::OpWithoutKernelTest,
-    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+                             paddle::framework::OpWithoutKernelCheckerMaker);
 TEST(OperatorBase, all) {
+  paddle::framework::InitDevices({"CPU"});
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("test_operator");
  BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -80,13 +80,13 @@ TEST(OperatorBase, all) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);
-  paddle::platform::CPUDeviceContext device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  scope.Var("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->Run(scope, device_context);
+  op->Run(scope, cpu_place);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
@@ -114,7 +114,7 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}
-  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
+  OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override {
    return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
  }
 };
@@ -123,7 +123,6 @@ template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
-    std::cout << "this is cpu kernel" << std::endl;
    std::cout << ctx.op().DebugString() << std::endl;
    cpu_kernel_run_num++;
    ASSERT_EQ(ctx.op().Input("x"), "IN1");
@@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
 // test with single input
 TEST(OpKernel, all) {
+  paddle::framework::InitDevices({"CPU"});
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -205,12 +205,12 @@ TEST(OpKernel, all) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
@@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
  using namespace paddle::framework;
+  paddle::framework::InitDevices({"CPU"});
  proto::OpDesc op_desc;
  op_desc.set_type("op_multi_inputs_with_kernel");
  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
  BuildVar("k", {"k0"}, op_desc.add_inputs());
@@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);
-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
  scope.Var("x0")->GetMutable<LoDTensor>();
  scope.Var("x1")->GetMutable<LoDTensor>();
@@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) {
  scope.Var("y1")->GetMutable<LoDTensor>();
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
 }
 class OperatorClone : public paddle::framework::OperatorBase {
@@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase {
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {}
+           const paddle::platform::Place& place) const override {}
 };
 TEST(Operator, Clone) {
+  paddle::framework::InitDevices({"CPU"});
  OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                  paddle::framework::VariableNameMap{},
                  paddle::framework::AttributeMap{});

--- a/paddle/framework/tensor.md
+++ b/paddle/framework/tensor.md
@@ -71,7 +71,7 @@ private:
 ```
 ```c++
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<CUDAPlace, CpuPlace> Place;
 typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
                       Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
 typedef boost::variant<

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
          boost::get<platform::CPUPlace>(place), size, type));
    } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
    }
 #else
-      holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
+      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::GPUPlace>(place), size, type));
+          boost::get<platform::CUDAPlace>(place), size, type));
    }
 #endif
    offset_ = 0;

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
    EXPECT_EQ(p1, p2);
  }
 #endif
@@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
  {
    Tensor src_tensor;
    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
@@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
 #ifdef PADDLE_WITH_CUDA
  {
    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
    Tensor slice_tensor = src_tensor.Slice(2, 6);
    DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
@@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);

--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -82,6 +82,28 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
 #endif
 }
+/**
+ * @brief CopyFrom support CPU <-> CPU
+ */
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     Tensor* dst) {
+  src.check_memory_size();
+  dst->Resize(src.dims());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+  auto size = src.numel() * SizeOfType(src.type());
+  PADDLE_ENFORCE(platform::is_cpu_place(src_place) &&
+                 platform::is_cpu_place(dst_place));
+  memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+               boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+}
 /**
 * @brief   Copy the content of an external vector to a tensor.
 *
@@ -108,13 +130,28 @@ inline void CopyFromVector(const std::vector<T>& src,
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
    memory::Copy(
-        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
        size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
 #endif
 }
+/**
+ * @brief CopyFromVector CPU vector -> CPU Tensor
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
+  platform::CPUPlace dst_place = platform::CPUPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+}
 /**
 * @brief   Copy the content of a tensor to a vector
 *
@@ -141,12 +178,30 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(src.place())) {  // NOLINT
    memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
        src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
 #endif
 }
+/**
+ * @brief CopyToVector CPUTensor <-> CPU Vector
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+  PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
+  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
+               src_ptr, size);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -17,6 +17,7 @@
 namespace paddle {
 namespace framework {
 TEST(CopyFrom, Tensor) {
  Tensor src_tensor;
  Tensor dst_tensor;
@@ -29,7 +30,7 @@ TEST(CopyFrom, Tensor) {
  memcpy(src_ptr, arr, 9 * sizeof(int));
  auto cpu_place = new platform::CPUPlace();
-  CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  CopyFrom(src_tensor, *cpu_place, &dst_tensor);
  const int* dst_ptr = dst_tensor.data<int>();
  ASSERT_NE(src_ptr, dst_ptr);
@@ -58,7 +59,7 @@ TEST(CopyFrom, Tensor) {
    memcpy(src_ptr, arr, 9 * sizeof(int));
    // CPU Tensor to GPU Tensor
-    auto gpu_place = new platform::GPUPlace(0);
+    auto gpu_place = new platform::CUDAPlace(0);
    platform::CUDADeviceContext gpu_ctx(*gpu_place);
    CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
@@ -104,8 +105,7 @@ TEST(CopyFromVector, Tensor) {
    // Copy to CPU Tensor
    cpu_tensor.Resize(make_ddim({3, 3}));
    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, &cpu_tensor);
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
    // Compare Tensors
    const int* cpu_ptr = cpu_tensor.data<int>();
@@ -117,7 +117,7 @@ TEST(CopyFromVector, Tensor) {
    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
    cpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    CopyFromVector<int>(src_vec, &cpu_tensor);
    cpu_ptr = cpu_tensor.data<int>();
    src_ptr = src_vec.data();
    ASSERT_NE(src_ptr, cpu_ptr);
@@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {
    // Copy to GPUTensor
    gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
+    auto gpu_place = new paddle::platform::CUDAPlace();
    CUDADeviceContext gpu_ctx(*gpu_place);
    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
    // Copy from GPU to CPU tensor for comparison
@@ -198,9 +198,8 @@ TEST(CopyToVector, Tensor) {
    }
    CPUPlace place;
-    CPUDeviceContext cpu_ctx(place);
    std::vector<int> dst;
-    CopyToVector<int>(src, cpu_ctx, &dst);
+    CopyToVector<int>(src, &dst);
    for (int i = 0; i < 3 * 3; ++i) {
      EXPECT_EQ(src_ptr[i], dst[i]);
@@ -210,7 +209,7 @@ TEST(CopyToVector, Tensor) {
  {
    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    Tensor gpu_tensor;
-    GPUPlace place;
+    CUDAPlace place;
    CUDADeviceContext gpu_ctx(place);
    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <condition_variable>
+#include <cstdio>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include "paddle/platform/call_once.h"
+#include "paddle/platform/enforce.h"
+namespace paddle {
+namespace framework {
+typedef std::function<void()> Task;
+class ThreadPool {
+ public:
+  /**
+   * @brief   Get a instance of threadpool, the thread number will
+   *          be specified as the number of hardware thread contexts
+   */
+  static ThreadPool* GetInstance() {
+    std::call_once(init_flag, &ThreadPool::Init);
+    return threadpool.get();
+  }
+  ~ThreadPool() {
+    {
+      // notify all threads to stop running
+      running_ = false;
+      scheduled_.notify_all();
+    }
+    for (auto& t : threads_) {
+      t->join();
+      t.reset(nullptr);
+    }
+  }
+  int GetNumThreads() const { return num_threads_; }
+  int GetAvailable() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return available_;
+  }
+  /**
+   * @brief   Push a function to the queue, and will be scheduled and
+   *          executed if a thread is available.
+   * @param[in] Task  will be pushed to the task queue.
+   */
+  void Run(const Task& fn) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    tasks_.push(fn);
+    lock.unlock();
+    scheduled_.notify_one();
+  }
+  /**
+   * @brief   Wait until all the tasks are completed.
+   */
+  void Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    completed_.wait(lock, [=] { return Done() == true; });
+  }
+ private:
+  ThreadPool& operator=(const ThreadPool&) = delete;
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool(int num_threads)
+      : num_threads_(num_threads), available_(num_threads), running_(true) {
+    threads_.resize(num_threads);
+    for (auto& thread : threads_) {
+      // TODO(Yancey1989): binding the thread on the specify CPU number
+      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+    }
+  }
+  /**
+   * @brief   If the task queue is empty and avaialbe
+   *          is equal to the number of threads, means that
+   *          all tasks are completed.
+   *
+   *          Note: this function is not thread-safe.
+   *
+   * @return true if all tasks are completed.
+   */
+  bool Done() { return tasks_.empty() && available_ == num_threads_; }
+  void TaskLoop() {
+    while (running_) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+      if (!running_) {
+        break;
+      }
+      // pop a task from the task queue
+      auto task = tasks_.front();
+      tasks_.pop();
+      --available_;
+      lock.unlock();
+      // run the task
+      task();
+      {
+        std::unique_lock<std::mutex> lock(mutex_);
+        ++available_;
+        if (Done()) {
+          completed_.notify_all();
+        }
+      }
+    }
+  }
+  static void Init() {
+    if (threadpool.get() == nullptr) {
+      // TODO(Yancey1989): specify the max threads number
+      int num_threads = std::thread::hardware_concurrency();
+      PADDLE_ENFORCE_GT(num_threads, 0);
+      threadpool.reset(new ThreadPool(num_threads));
+    }
+  }
+ private:
+  static std::unique_ptr<ThreadPool> threadpool;
+  static std::once_flag init_flag;
+  int num_threads_;
+  int available_;
+  bool running_;
+  std::queue<Task> tasks_;
+  std::vector<std::unique_ptr<std::thread>> threads_;
+  std::mutex mutex_;
+  std::condition_variable scheduled_;
+  std::condition_variable completed_;
+};
+std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
+std::once_flag ThreadPool::init_flag;
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "threadpool.h"
+#include <gtest/gtest.h>
+#include <atomic>
+#include <chrono>
+#include <map>
+#include <thread>
+namespace framework = paddle::framework;
+void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  for (int i = 0; i < cnt; ++i) {
+    pool->Run([&sum]() { sum.fetch_add(1); });
+  }
+}
+TEST(ThreadPool, ConcurrentInit) {
+  framework::ThreadPool* pool;
+  int concurrent_cnt = 50;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < concurrent_cnt; ++i) {
+    std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+TEST(ThreadPool, ConcurrentStart) {
+  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
+  std::atomic<int> sum(0);
+  std::vector<std::thread> threads;
+  int concurrent_cnt = 50;
+  // sum = (n * (n + 1)) / 2
+  for (int i = 1; i <= concurrent_cnt; ++i) {
+    std::thread t(do_sum, pool, std::ref(sum), i);
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  pool->Wait();
+  EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
+}
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
 To allocate 4KB memory on the 3rd GPU:
 ```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
 ```
 To free memory and check the so-far used amount of memory on a place:
 ```cpp
-auto pl = platform::GPUPlace(0);
+auto pl = platform::CUDAPlace(0);
 p = memory::Alloc(pl, 4*1024);
 cout << memory::Used(pl);
 memory::Free(pl, p);
@@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
 ```cpp
 template<>
@@ -49,7 +49,7 @@ and
 ```cpp
 template<>
-void Alloc<GPUPlace>(GPUPlace p, size_t size) {
+void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
  return GetGPUBuddyAllocator(p.id)->Alloc(size);
 }
 ```
@@ -122,7 +122,7 @@ There are two implementations of `Context`:
 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
 ### Majel

--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+void Copy<platform::CPUPlace, platform::CUDAPlace>(
-                                                  void* dst,
+    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
-                                                  platform::GPUPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
  platform::SetDeviceId(src_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+void Copy<platform::CUDAPlace, platform::CPUPlace>(
-                                                  void* dst,
+    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
-                                                  platform::CPUPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
  platform::SetDeviceId(dst_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
 template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+void Copy<platform::CUDAPlace, platform::CUDAPlace>(
-                                                  void* dst,
+    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
-                                                  platform::GPUPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
-                                                  const void* src, size_t num,
-                                                  cudaStream_t stream) {
  if (dst_place == src_place) {
    platform::SetDeviceId(src_place.device);
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
 }
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
  auto* ptr = buddy_allocator->Alloc(size);
  if (ptr == nullptr) {
@@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
    LOG(WARNING) << "total " << total;
    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
    platform::SetDeviceId(cur_dev);
  }
  return ptr;
 }
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
  GetGPUBuddyAllocator(place.device)->Free(p);
 }

--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
 #ifdef PADDLE_WITH_CUDA
-size_t align(size_t size, paddle::platform::GPUPlace place) {
+size_t align(size_t size, paddle::platform::CUDAPlace place) {
  size += sizeof(paddle::memory::detail::Metadata);
  size_t alignment = paddle::platform::GpuMinChunkSize();
  size_t remaining = size % alignment;
@@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {
  EXPECT_EQ(p, nullptr);
-  paddle::platform::GPUPlace gpu(0);
+  paddle::platform::CUDAPlace gpu(0);
  p = paddle::memory::Alloc(gpu, 4096);
  EXPECT_NE(p, nullptr);
@@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
 }
 TEST(BuddyAllocator, GPUMultAlloc) {
-  paddle::platform::GPUPlace gpu;
+  paddle::platform::CUDAPlace gpu;
  std::unordered_map<void *, size_t> ps;

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto* inference = ctx.Input<Tensor>("Out");
    auto* indices = ctx.Input<Tensor>("Indices");
    auto* label = ctx.Input<Tensor>("Label");

--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -15,6 +15,7 @@
 #pragma once
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase {
 protected:
  size_t GetOffset(const framework::Scope &scope,
-                   const platform::DeviceContext &dev_ctx) const {
+                   const platform::Place &place) const {
    auto *i = scope.FindVar(Input("I"));
    PADDLE_ENFORCE(i != nullptr, "I must be set");
    auto &i_tensor = i->Get<framework::LoDTensor>();
    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    size_t offset;
    if (platform::is_gpu_place(i_tensor.place())) {
      // FIXME: Avoid copy from GPU to CPU

--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -12,10 +12,12 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include <numeric>
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
    auto &rank_table =
        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
@@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
          continue;
        }
        auto slice = out->Slice(out_offset, out_offset + len);
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
        framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                            dev_ctx, &slice);
        out_offset += len;

--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/var_type.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *x = scope.FindVar(Input("X"));
    if (x == nullptr) {
      return;
@@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase {
    PADDLE_ENFORCE(
        out != nullptr,
        "The Output(Out) should not be null if the Input(X) is set.");
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
  }
 };

--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),

--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -304,7 +304,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {

--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -53,7 +53,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -179,7 +179,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout data_layout =

--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                     const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+    auto& dev_ctx = *pool.Borrow(dev_place);
    framework::ExecutionContext ctx(*this, scope, dev_ctx);
    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");

--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase {
  }
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
    LOG(INFO) << "run beam search op";
    auto ids_var = scope.FindVar(Input("ids"));
    auto scores_var = scope.FindVar(Input("scores"));

--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -55,7 +55,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(framework::proto::DataType::FP32,
                                   ctx.device_context());

--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
    // CompareOp kernel's device type is decided by input tensor place
    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
    return kt;

--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/cond_op.h"
 #include "paddle/operators/gather.h"
 #include "paddle/operators/scatter.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
  }
 }
-void CondOp::Run(const Scope& scope,
+void CondOp::Run(const Scope& scope, const platform::Place& place) const {
-                 const platform::DeviceContext& dev_ctx) const {
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+  auto& dev_ctx = *pool.Borrow(place);
  PrepareDataForSubnet(scope, dev_ctx);
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+    sub_net_op_[i]->Run(*sub_scopes[i], place);
  }
  MergeDataFromSubnet(scope, dev_ctx);
 }

--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase {
  }
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
+           const platform::Place& place) const override;
 private:
  const int TRUE_BRANCH = 0;

--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp {
                     const framework::AttributeMap &attrs)
      : ConditionalOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto xs = InputTensors(scope);
    bool need_run = std::all_of(
        xs.begin(), xs.end(),
@@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp {
      scopes->front() = &scope.NewScope();
      auto &cur_scope = *scopes->front();
+      framework::Executor exec(dev_place);
      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
    }
  }
@@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
                         const framework::AttributeMap &attrs)
      : ConditionalOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto xs = this->InputTensors(scope);
    bool need_run = std::all_of(
        xs.begin(), xs.end(),
@@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
      framework::Scope &cur_scope = *scopes[0];
+      framework::Executor exec(dev_place);
      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
                                  Outputs(framework::GradVarName("Params")));
-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
                                  Outputs(framework::GradVarName("X")));
    }
  }
 private:
  void AssignLocalGradientToGlobal(
-      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const platform::Place &place, const framework::Scope &cur_scope,
      const std::vector<std::string> &p_names,
      const std::vector<std::string> &pg_names) const {
    for (size_t i = 0; i < p_names.size(); ++i) {
@@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto assign = framework::OpRegistry::CreateOp(
          "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
          framework::AttributeMap{});
-      assign->Run(cur_scope, dev_ctx);
+      assign->Run(cur_scope, place);
      cur_scope.Rename(new_in_grad_name, in_grad_name);
    }
  }

--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");
@@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
        cudnn_output_desc, algo, &workspace_size_in_bytes));
    // Allocate on GPU memory
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv forward ---------------------
    T alpha = 1.0f, beta = 0.0f;
@@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto input = ctx.Input<Tensor>("Input");
    auto filter = ctx.Input<Tensor>("Filter");
    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv workspace ---------------------
    // Already on GPU
    void* cudnn_workspace = nullptr;
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv backward data ---------------------
    T alpha = 1.0f, beta = 0.0f;

--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");
@@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
            cudnn_output_desc, algo, &workspace_size_in_bytes));
    // Allocate on GPU memory
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv transpose forward ---------------------
@@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto input = ctx.Input<Tensor>("Input");
    auto filter = ctx.Input<Tensor>("Filter");
    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv workspace ---------------------
    // Already on GPU
    void* cudnn_workspace = nullptr;
-    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv backward data ---------------------
    // FIXME(typhoonzero): template type T may not be the same as cudnn call.

--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -120,12 +120,18 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
        ctx.device_context());
  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::OpKernelType& actual_kernel_type) const override {
+    return framework::OpKernelType(actual_kernel_type.data_type_,
+                                   platform::CPUPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
 protected:
  // Explicitly set that the data type of computation kernel of cross_entropy
  // is determined by its input "X".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
 protected:
  // Explicitly set that the data type of computation kernel of cross_entropy
  // is determined by its input "X".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),

--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> {
      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
    } else {
 #ifdef PADDLE_WITH_CUDA
-      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
      auto& cuda_ctx =
          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,

--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto feed_var_name = Input("X");
    auto *feed_var = scope.FindVar(feed_var_name);
@@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase {
    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+    framework::CopyFrom(feed_item, place, dev_ctx, out_item);
    out_item->set_lod(feed_item.lod());
  }
 };

--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -14,6 +14,7 @@
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto fetch_var_name = Input("X");
    auto *fetch_var = scope.FindVar(fetch_var_name);
    PADDLE_ENFORCE(fetch_var != nullptr,
@@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase {
    // FIXME(yuyang18): Should we assume the fetch operator always generate
    // CPU outputs?
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
    dev_ctx.Wait();
    dst_item.set_lod(src_item.lod());

--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto data_type =
        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
@@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase {
      auto cpu = platform::CPUPlace();
      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
    } else {
-      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
    }
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
    math::set_constant(dev_ctx, &out, value);
  }
 };

--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &out =
        detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                "Cannot find variable %s", Output("Out"))
@@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase {
    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    platform::CPUPlace cpu;
    auto force_cpu = Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
+    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
-                     framework::ToTypeIndex(dtype));
    framework::LoDTensor tensor;
-    if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) {
+    if (force_cpu || platform::is_cpu_place(place)) {
      tensor.ShareDataWith(out);
    } else {
      // Always make tensor in CPU memory.
@@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase {
    framework::VisitDataType(
        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
-    if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) {
+    if (!force_cpu && platform::is_gpu_place(place)) {
      // Copy tensor to out
-      framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out);
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+      framework::CopyFrom(tensor, place, dev_ctx, &out);
    }
  }
 };

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -57,7 +57,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),

--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto &out =
        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    // get input
    auto *var = scope.FindVar(Input(kInput));
    PADDLE_ENFORCE_NOT_NULL(var);

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
 protected:
  // Explicitly set that the data type of computation kernel of linear_chain_crf
  // is determined by its input "Emission".
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
@@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 protected:
  // Explicitly set that the data type of output of the linear_chain_crf_grad
  // operator is determined by its input: gradients of LogLikelihood.
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(

--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    // operators runs on GPU device.
    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                         Tensor* dst) {
-      dst->mutable_data<T>(platform::GPUPlace());
+      dst->mutable_data<T>(platform::CUDAPlace());
-      framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
+      framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
    };
    copyTensor(ctx, emission_exps_src, emission_exps_dst);
    copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
                         Tensor* dst) {
      if (src && dst) {
-        dst->mutable_data<T>(platform::GPUPlace());
+        dst->mutable_data<T>(platform::CUDAPlace());
-        framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
+        framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst);
      }
    };
    copyTensor(ctx, emission_grad_src, emission_grad_dst);

--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -11,10 +11,10 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
+#include <fstream>
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
-#include <fstream>
 namespace paddle {
 namespace operators {
@@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    std::ifstream fin(filename);
    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase {
    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
    framework::DeserializeFromStream(fin, tensor);
-    auto place = dev_ctx.GetPlace();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    if (platform::is_gpu_place(place)) {
      // copy CPU to GPU
      framework::LoDTensor cpu_tensor;

--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
@@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                   const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
    auto &out =
        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase {
                 const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto *out =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();

--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),

--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                          Input("X"))
                  .Get<framework::LoDTensor>();
@@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
        // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
        auto slice = out[i].Slice(static_cast<int>(offset),
                                  static_cast<int>(offset + len));
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
                                    static_cast<int>(each_range.end)),
                            x.place(), dev_ctx, &slice);

--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
@@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
    // LogicalOp kernel's device type is decided by input tensor place
    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
    return kt;

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
@@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),

--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
      // copy GPU memory to CPU pinned memory
      framework::Vector<int64_t> new_rows;
      new_rows.resize(ids_dim[0]);
-      auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
+      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
                   ids_dim[0] * sizeof(int64_t), stream);

--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),

--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto* x_tensor = ctx.Input<framework::Tensor>("X");
    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto x_tensor = ctx.Input<Tensor>("X");
    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");

--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -159,6 +159,7 @@ void testIm2col() {
 TEST(math, im2col) {
  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-  testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>();
+  testIm2col<paddle::platform::CUDADeviceContext,
+             paddle::platform::CUDAPlace>();
 #endif
 }
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -277,14 +277,6 @@ void set_constant_with_place<platform::CPUPlace>(
                           TensorSetConstantCPU(tensor, value));
 }
-template <>
-void set_constant_with_place<platform::MKLDNNPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
-}
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
  TensorSetConstantWithPlace(const platform::DeviceContext& context,
                             framework::Tensor* tensor, float value)

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>(
  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
                     platform::is_gpu_place(matrix_b.place()) &&
                     platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in GPUPlace");
+                 "Matrix must all be in CUDAPlace");
  int M = dim_out[0];
  int N = dim_out[1];
@@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>(
  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
                     platform::is_gpu_place(matrix_b.place()) &&
                     platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in GPUPlace");
+                 "Matrix must all be in CUDAPlace");
  int M = dim_out[0];
  int N = dim_out[1];
@@ -266,20 +266,13 @@ struct TensorSetConstantGPU {
 };
 template <>
-void set_constant_with_place<platform::GPUPlace>(
+void set_constant_with_place<platform::CUDAPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  framework::VisitDataType(framework::ToDataType(tensor->type()),
                           TensorSetConstantGPU(context, tensor, value));
 }
-template <>
-void set_constant_with_place<platform::CUDNNPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  set_constant_with_place<platform::GPUPlace>(context, tensor, value);
-}
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;

--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -94,8 +94,8 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
    T* out_buf = out->mutable_data<T>(out->place());
    const T* in_buf = input.data<T>();
-    for (size_t i = 0; i < height; ++i) {
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
-      for (size_t j = 0; j < size; ++j) {
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
        if (i == 0) {
          out_buf[j] = in_buf[i * size + j];
        } else {

--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) {
  float arr[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr, 6 * sizeof(float));
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) {
  float arr[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr, 6 * sizeof(float));
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) {
  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) {
  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) {
  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::framework::Tensor g_mat_a;
  paddle::framework::Tensor g_vec_b;
  paddle::framework::Tensor g_vec_c;

--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -58,15 +58,15 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE(platform::is_gpu_place(out_place));
    memory::Copy(
-        boost::get<platform::GPUPlace>(out_place), out_data,
+        boost::get<platform::CUDAPlace>(out_place), out_data,
-        boost::get<platform::GPUPlace>(in1_place), in1_data,
+        boost::get<platform::CUDAPlace>(in1_place), in1_data,
        in1_value.numel() * sizeof(T),
        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
    auto* in2_data = in2_value.data<T>();
-    memory::Copy(boost::get<platform::GPUPlace>(out_place),
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
                 out_data + in1_value.numel(),
-                 boost::get<platform::GPUPlace>(in2_place), in2_data,
+                 boost::get<platform::CUDAPlace>(in2_place), in2_data,
                 in2_value.numel() * sizeof(T), context.stream());
  }
 };
@@ -160,9 +160,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto* in1_data = in1_value.data<T>();
    auto* in2_data = in2_value->data<T>();
-    memory::Copy(boost::get<platform::GPUPlace>(in2_place),
+    memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
                 in2_data + input2_offset,
-                 boost::get<platform::GPUPlace>(in1_place), in1_data,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
                 in1_value.numel() * sizeof(T), context.stream());
  }
 };

--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) {
  using namespace paddle::platform;
  using namespace paddle::operators::math;
-  GPUPlace gpu_place(0);
+  CUDAPlace gpu_place(0);
  CPUPlace cpu_place;
  CUDADeviceContext ctx(gpu_place);
  SetConstant<CUDADeviceContext, float> functor;
@@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) {
  using namespace paddle::platform;
  using namespace paddle::operators::math;
-  GPUPlace gpu_place(0);
+  CUDAPlace gpu_place(0);
  CPUPlace cpu_place;
  CUDADeviceContext ctx(gpu_place);
  SetConstant<CUDADeviceContext, float> functor;

--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -122,6 +122,6 @@ TEST(math, vol2col) {
  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
  testVol2col<paddle::platform::CUDADeviceContext,
-              paddle::platform::GPUPlace>();
+              paddle::platform::CUDAPlace>();
 #endif  // PADDLE_WITH_CUDA
 }
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto &rank_table =
        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
    auto *out =

--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                   const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -113,7 +113,7 @@ This operator is used to perform matrix multiplication for input $X$ and $Y$.
 The equation is:
-    $$Out = X * Y$$
+$$Out = X * Y$$
 Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
 or not. But the output only shares the LoD information with input $X$.

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
@@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),

--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
    auto* index = index_t_cpu.data<int32_t>();
    auto stream = ctx.cuda_device_context().stream();
-    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    for (auto i = 0; i < rows; i++) {
      int32_t k = index[i];
      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
@@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
    auto* index = index_t_cpu.data<int32_t>();
    auto stream = ctx.cuda_device_context().stream();
-    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    for (auto i = 0; i < rows; i++) {
      size_t k = static_cast<size_t>(index[i]);
      if (d_ins[k]) {

--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    const auto &name = Output("Communicator");
    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                            "Can not find variable '%s' in the scope.", name);

--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/operators/nccl_op.cu.cc
@@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
    auto stream = ctx.cuda_device_context().stream();
    // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);
    for (size_t i = 0; i < ins.size(); ++i) {
@@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
                      ctx.device_context())
                      .stream();
    // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);
    auto ins_names = ctx.Inputs("X");
@@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
                      ctx.device_context())
                      .stream();
    // device id
-    int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
    int idx = comm->GetCommId(gpu_id);
    if (idx == root) {

--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -22,6 +22,7 @@
 #include <vector>
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
@@ -49,9 +50,9 @@ const f::DDim kDims = {100, 100};
 class NCCLTester : public ::testing::Test {
 public:
  virtual void SetUp() override {
-    cpu_ctx = new p::CPUDeviceContext(p::CPUPlace());
+    paddle::platform::CPUPlace cpu_place;
    for (size_t i = 0; i < gpu_list.size(); ++i) {
-      p::GPUPlace place(i);
+      p::CUDAPlace place(i);
      dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
    }
@@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test {
  }
  void NCCLInitOp() {
+    paddle::platform::CPUPlace cpu_place;
    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
    op1->SetType("ncclInit");
@@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test {
    auto op = f::OpRegistry::CreateOp(*op1);
    VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope, *cpu_ctx);
+    op->Run(g_scope, cpu_place);
    VLOG(1) << "NCCLInitOp finished.";
  }
@@ -85,7 +87,7 @@ class NCCLTester : public ::testing::Test {
    std::unique_lock<std::mutex> lk(mu);
    const f::OpDesc *op1 = &op_desc;
-    p::GPUPlace place(gpu_id);
+    p::CUDAPlace place(gpu_id);
    auto &ctx = dev_ctxs.at(gpu_id);
    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
@@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test {
    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
    VLOG(1) << " send_tensor : " << send_tensor->numel()
            << " recv_tensor : " << recv_tensor->numel();
-    op->Run(*scope, *ctx);
+    op->Run(*scope, place);
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }
 public:
  std::vector<p::DeviceContext *> dev_ctxs;
-  p::DeviceContext *cpu_ctx;
  f::Scope g_scope;
  std::mutex mu;
 };
@@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) {
  op_desc->SetAttr("gpus", {gpu_list});
  f::Scope g_scope;
-  std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace()));
+  paddle::platform::CPUPlace cpu_place;
  auto *var = g_scope.Var("x1");
  var->GetMutable<p::Communicator>();
  auto op = f::OpRegistry::CreateOp(*op_desc);
  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, *ctx.get());
+  op->Run(g_scope, cpu_place);
  VLOG(1) << "NCCLInitOp finished.";
 }
@@ -170,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
  for (size_t i = 0; i < dev_scopes.size(); ++i) {
    p::CPUPlace cpu_place;
-    p::GPUPlace gpu_place(gpu_list[i]);
+    p::CUDAPlace gpu_place(gpu_list[i]);
    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
    auto *rt = recv_tensor.data<float>();
@@ -179,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
    auto *ct = result_tensor->mutable_data<float>(cpu_place);
    paddle::memory::Copy(
-        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
        recv_tensor.numel() * sizeof(float),
        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
@@ -218,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
  p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[kRoot]);
+  p::CUDAPlace gpu_place(gpu_list[kRoot]);
  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
@@ -228,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
  auto *ct = result_tensor->mutable_data<float>(cpu_place);
  paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
@@ -267,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
  float result = kRoot;
  p::CPUPlace cpu_place;
-  p::GPUPlace gpu_place(gpu_list[idx]);
+  p::CUDAPlace gpu_place(gpu_list[idx]);
  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
@@ -276,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
  auto *ct = result_tensor->mutable_data<float>(cpu_place);
  paddle::memory::Copy(
-      cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
      recv_tensor.numel() * sizeof(float),
      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
@@ -294,9 +295,18 @@ int main(int argc, char **argv) {
    return 0;
  }
-  for (int i = 0; i < dev_count; ++i) {
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
    gpu_list.emplace_back(i);
  }
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
  testing::InitGoogleTest(&argc, argv);
  // device context should be release before scope.

--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
@@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase {
   * will be used.
   */
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
    for (auto& op : ops_) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
    }
  }

--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
  DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const platform::Place& place) const override {
-           const platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;
  }
 };

--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    const Tensor *input = ctx.Input<Tensor>("X");
    Tensor *output = ctx.Output<Tensor>("Out");
@@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    const Tensor *input = ctx.Input<Tensor>("X");
    const Tensor *output = ctx.Input<Tensor>("Out");

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),

--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
@@ -154,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
        "Noting that reducing on the first dim will make the LoD info lost.")
        .SetDefault(0);
    AddComment(R"DOC(
-        PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) 
+PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's
-        model performance. 
+performance.
-        Within some context, e.g. the "query", a LTR model generates scores
-        for a list of items, which gives a partial order of the items.
+Within some context, e.g. the "query", a LTR model generates scores for a list
-        PositiveNegativePairOp takes a list of reference rank order 
+of items, which gives a partial order of the items. PositiveNegativePairOp
-        (Input("Label")) and the model generated scores (Input(Score)) as 
+takes a list of reference rank order (Input("Label")) and the model generated
-        inputs and counts the pairs that ranked correctly and incorrectly.
+scores (Input(Score)) as inputs and counts the pairs that ranked correctly
+and incorrectly.
 )DOC");
  }
 };

--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase {
      : RecurrentBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
    VLOG(3) << "Static RNN input sequence length = " << seq_len;
    StepScopes scopes = CreateStepScopes(scope, seq_len);
    auto reverse = Attr<bool>(kReverse);
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
    for (size_t i = 0; i < seq_len; ++i) {
@@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase {
      executor.Run(*program, &cur_scope, block->ID(),
                   false /*create_local_scope*/);
+      // get device context from pool
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
      // Copy inside::output -> outside::output
      //    outside::output[seq_offset: seq_offset + 1] = inside::output
      this->LinkTensorWithCallback(
@@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase {
              framework::LoDTensor *dst_tensor) {
            if (i == 0) {  // create output tensor at begin
              dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
-              dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type());
+              dst_tensor->mutable_data(place, src_tensor.type());
            }
            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
            // Explicit copy output since the local RNN scope can be destroyed
            // early.
-            framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx,
+            framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out);
-                                &dst_out);
          });
      scopes.Next();
@@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase {
      : RecurrentBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
    StepScopes scopes = CreateStepScopes(scope, seq_len);
    auto reverse = Attr<bool>(kReverse);
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
      size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
      VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
@@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase {
          auto *cur_grad_var = cur_scope.Var(cur_grad);
          auto cur_grad_tensor =
              cur_grad_var->GetMutable<framework::LoDTensor>();
-          framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx,
+          framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor);
-                              cur_grad_tensor);
        }
      }
@@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase {
            auto zero_op = framework::OpRegistry::CreateOp(
                "fill_constant", framework::VariableNameMap{},
                {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, place);
          }
          auto new_inside_name = cur_scope.Rename(inside_grad_name);
@@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase {
          auto sum_op = framework::OpRegistry::CreateOp(
              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-          sum_op->Run(cur_scope, dev_ctx);
+          sum_op->Run(cur_scope, place);
          cur_scope.Rename(new_inside_name, inside_grad_name);
        }
@@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase {
            }
            if (step_id == 0) {  // alloc memory
              outside->Resize(PrependDims(seq_len, inside.dims()));
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+              outside->mutable_data(place, inside.type());
            }
            auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst);
+            framework::CopyFrom(inside, place, dev_ctx, &dst);
          });
      VLOG(5) << "Link outside gradient finished ";
@@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase {
            [&](const framework::LoDTensor &inside,
                framework::LoDTensor *outside) {
              outside->Resize(inside.dims());
-              outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+              outside->mutable_data(place, inside.type());
-              framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside);
+              framework::CopyFrom(inside, place, dev_ctx, outside);
            });
        VLOG(5) << "Link initialize state gradient finished ";
      }

--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -73,7 +73,7 @@ class RecvOp : public framework::OperatorBase {
  }
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    // FIXME(typhoonzero): no new scopes for every run.
    framework::Scope &recv_scope = scope.NewScope();
    rpc_service_->SetScope(&recv_scope);
@@ -113,7 +113,9 @@ class RecvOp : public framework::OperatorBase {
        auto *var = recv_scope.Var(grad_var_name);
        auto *tensor = var->GetMutable<framework::LoDTensor>();
        // FIXME(typhoonzero): do not copy
-        framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
+        framework::CopyFrom(v.second, place, dev_ctx, tensor);
      }
      rpc_service_->Reset();
@@ -121,7 +123,7 @@ class RecvOp : public framework::OperatorBase {
      framework::proto::ProgramDesc program_desc;
      program_desc.ParseFromString(program_str);
      framework::ProgramDesc program(program_desc);
-      framework::Executor executor(dev_ctx);
+      framework::Executor executor(place);
      // Run sub graph to get optimized tensor
      try {
        executor.Run(program, &recv_scope, 0, /*global_block*/

--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -12,9 +12,10 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include <paddle/framework/lod_rank_table.h>
+#include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -53,7 +54,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
                                  const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &x =
        detail::Ref(scope.FindVar(Input("X")),
                    "Cannot find input lod tensor variable %s", Input("X"))
@@ -69,11 +70,11 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
    out.Resize(x.dims());
    out.mutable_data(x.place(), x.type());
-    this->process(dev_ctx, x, rank_table, &out);
+    this->process(place, x, rank_table, &out);
  }
 protected:
-  virtual void process(const platform::DeviceContext &dev_ctx,
+  virtual void process(const platform::Place &place,
                       const framework::LoDTensor &x,
                       const framework::LoDRankTable &rank_table,
                       framework::LoDTensor *out) const = 0;
@@ -104,7 +105,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
    return absolute_table;
  }
-  size_t CopyTensorAndLod(const platform::DeviceContext &dev_ctx,
+  size_t CopyTensorAndLod(const platform::Place &place,
                          const AbsoluteRankTableItem &item,
                          const framework::LoDTensor &x,
                          framework::LoDTensor *out, size_t out_offset) const {
@@ -130,6 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
    auto x_sliced = x.Slice(x_offset, x_offset + len);
    auto out_sliced = out->Slice(out_offset, out_offset + len);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
    out_offset += len;
    return out_offset;
@@ -145,8 +148,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
 protected:
-  void process(const platform::DeviceContext &dev_ctx,
+  void process(const platform::Place &place, const framework::LoDTensor &x,
-               const framework::LoDTensor &x,
               const framework::LoDRankTable &rank_table,
               framework::LoDTensor *out) const override {
    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
@@ -154,7 +156,7 @@ class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
    out->mutable_lod()->clear();
    for (auto &item : rank_table.items()) {
      PADDLE_ENFORCE_LT(item.index, absolute_table.size());
-      out_offset = CopyTensorAndLod(dev_ctx, absolute_table[item.index], x, out,
+      out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
                                    out_offset);
    }
  }
@@ -192,8 +194,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
 protected:
-  void process(const platform::DeviceContext &dev_ctx,
+  void process(const platform::Place &place, const framework::LoDTensor &x,
-               const framework::LoDTensor &x,
               const framework::LoDRankTable &rank_table,
               framework::LoDTensor *out) const override {
    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
@@ -214,7 +215,7 @@ class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
    // Copy TensorAndLod
    size_t out_offset = 0;
    for (auto &offset : offsets) {
-      out_offset = this->CopyTensorAndLod(dev_ctx, absolute_table[offset.first],
+      out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
                                          x, out, out_offset);
    }
  }

--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
@@ -16,7 +16,7 @@
 REGISTER_OP_CUDA_KERNEL(
    reshape,
-    paddle::operators::ReshapeKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
 REGISTER_OP_CUDA_KERNEL(
    reshape_grad,
-    paddle::operators::ReshapeGradKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto mem_var_name = Input("X");
    auto *mem_var = scope.FindVar(mem_var_name);
    PADDLE_ENFORCE(mem_var != nullptr,
@@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                        const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto out_grad_var_name = Input(framework::GradVarName("Out"));
    auto *out_grad_var = scope.FindVar(out_grad_var_name);
@@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
      auto zero_op = framework::OpRegistry::CreateOp(
          "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
-      zero_op->Run(scope, dev_ctx);
+      zero_op->Run(scope, dev_place);
    } else {
      auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
      auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();

--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),

--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load);
 TEST(SaveLoadOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;
-  paddle::platform::CPUDeviceContext ctx(place);
  auto var = scope.Var("test_var");
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({10, 10});
@@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) {
  auto save_op = paddle::framework::OpRegistry::CreateOp(
      "save", {{"X", {"test_var"}}}, {}, attrs);
-  save_op->Run(scope, ctx);
+  save_op->Run(scope, place);
  auto load_var = scope.Var("out_var");
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  auto load_op = paddle::framework::OpRegistry::CreateOp(
      "load", {}, {{"Out", {"out_var"}}}, attrs);
-  load_op->Run(scope, ctx);
+  load_op->Run(scope, place);
  int* actual = target->data<int>();
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    EXPECT_EQ(expect[i], actual[i]);

--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -21,6 +21,7 @@
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    auto overwrite = Attr<bool>("overwrite");
@@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase {
                   "SaveOp only support LoDTensor, %s has wrong type", iname);
    auto &tensor = var->Get<framework::LoDTensor>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    framework::SerializeToStream(fout, tensor, dev_ctx);
  }
 };

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
@@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),

--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -49,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
        .AsIntermediate();
    AddAttr<std::string>(
        "pooltype",
-        "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
+        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
        .SetDefault("AVERAGE")
        .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
    AddComment(R"DOC(
@@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),

--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),

--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp {
      : ArrayOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *x_var = scope.FindVar(Input("X"));
    PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
    auto &x_tensor = x_var->Get<framework::LoDTensor>();
-    size_t offset = this->GetOffset(scope, dev_ctx);
+    size_t offset = this->GetOffset(scope, place);
    auto *rank_table_var = scope.FindVar(Input("RankTable"));
    PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
    auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
@@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
      : ArrayOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
    auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
    PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
@@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
    dx_tensor.Resize(x_tensor.dims());
    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
    if (dout_var == nullptr) {  // dx_tensor fill zero
      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
    } else {

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
@@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(

--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                   const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
    auto *out_true =
@@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
    auto &x_lod = x.lod();
    auto &mask_dim = mask.dims();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
    if (platform::is_cpu_place(mask.place())) {
      cpu_mask->ShareDataWith(mask);

--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -82,7 +82,7 @@ TEST(StridedMemcpy, GPUCrop) {
  };
  // clang-format on
-  platform::GPUPlace gpu0(0);
+  platform::CUDAPlace gpu0(0);
  platform::CPUPlace cpu;
  platform::CUDADeviceContext ctx(gpu0);
@@ -121,7 +121,7 @@ TEST(StridedMemcpy, GPUConcat) {
  };
  // clang-format on
-  platform::GPUPlace gpu0(0);
+  platform::CUDAPlace gpu0(0);
  platform::CPUPlace cpu;
  platform::CUDADeviceContext ctx(gpu0);

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto x_vars = ctx.MultiInputVar("X");
    if (x_vars[0]->IsType<framework::LoDTensor>()) {

--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp {
      : ArrayOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *x = scope.FindVar(Input("X"));
    if (x == nullptr) return;
    auto &x_tensor = x->Get<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
    auto *out =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
    if (offset >= out->size()) {
@@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp {
    }
    if (x_tensor.memory_size() > 0) {
      auto *out_tensor = &out->at(offset);
-      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+      CopyFrom(x_tensor, place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_tensor.lod());
    } else {
      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
@@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp {
                  const framework::AttributeMap &attrs)
      : ArrayOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *x = scope.FindVar(Input("X"));
    PADDLE_ENFORCE(x != nullptr, "X must be set");
    auto &x_array = x->Get<framework::LoDTensorArray>();
    auto *out = scope.FindVar(Output("Out"));
    PADDLE_ENFORCE(out != nullptr, "Out must be set");
    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
-    size_t offset = GetOffset(scope, dev_ctx);
+    size_t offset = GetOffset(scope, place);
    if (offset < x_array.size()) {
-      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-                          out_tensor);
+      auto &dev_ctx = *pool.Borrow(place);
+      framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
    } else {
      VLOG(10) << "offset " << offset << " >= " << x_array.size();

--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "It must use CUDAPlace.");
    auto* input = ctx.Input<Tensor>("X");
    auto* output = ctx.Output<Tensor>("Out");
    auto* indices = ctx.Output<Tensor>("Indices");

--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -70,18 +70,19 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 Transpose Operator.
 The input tensor will be permuted according to the axis values given.
-The op functions similar to how numpy.transpose works in python.
+The op functions is similar to how numpy.transpose works in python.
-For example:
- >> input = numpy.arange(6).reshape((2,3))
+For example: input = numpy.arange(6).reshape((2,3))
- >> input
+the input is:
- array([[0, 1, 2],
+array([[0, 1, 2],
-        [3, 4, 5]])
+      [3, 4, 5]])
- >> axis = [1, 0]
+given axis is: [1, 0]
- >> output = input.transpose(axis)
- >> output
+output = input.transpose(axis)
- array([[0, 3],
+then the output is:
-        [1, 4],
+array([[0, 3],
-		[2, 5]])
+       [1, 4],
+       [2, 5]])
 So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  }
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -53,16 +53,14 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
        "(string), unpooling type, can be \"max\" for max-unpooling ")
        .InEnum({"max"});
    AddComment(R"DOC(
-        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
-        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+$(N, C_{out}, H_{out}, W_{out})$, where
-        Where
+$$
-          $$
+H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
-            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
-            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+$$
-          $$
+Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
-        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
+)DOC");
-        /07/iccv2011.pdf
-        )DOC");
  }
 };
@@ -73,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) {
 class UnpoolOp : public framework::OperatorWithKernel {
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -112,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel {
 class UnpoolOpGrad : public framework::OperatorWithKernel {
 protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),

--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase {
      : framework::OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
    auto step_scopes =
@@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase {
      : framework::OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
-    framework::Executor executor(dev_ctx);
+    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
@@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase {
            auto zero_op = framework::OpRegistry::CreateOp(
                "fill_constant", framework::VariableNameMap{},
                {{"Out", {pg_names[param_id]}}}, attrs);
-            zero_op->Run(scope, dev_ctx);
+            zero_op->Run(scope, dev_place);
          }
        }
@@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase {
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
-        sum_op->Run(cur_scope, dev_ctx);
+        sum_op->Run(cur_scope, dev_place);
        cur_scope.Rename(new_inside_name, inside_grad_name);
      }
    }

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -25,7 +25,7 @@ ENDIF()
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
-nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
+nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)

--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
@@ -22,23 +22,7 @@ namespace paddle {
 namespace platform {
 void CudaProfilerInit(std::string output_file, std::string output_mode,
-                      std::vector<std::string> config_flags) {
+                      std::string config_file) {
-  std::array<char, 128> buf;
-  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
-  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
-  memcpy(buf.data(), tmpl.data(), tmpl.size());
-  auto result = mktemp(buf.data());
-  PADDLE_ENFORCE(strlen(result) != 0);
-  std::string config_file = result;
-  {
-    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
-    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
-    for (const auto& line : config_flags) {
-      ofs << line << std::endl;
-    }
-  }
  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
  PADDLE_ENFORCE(

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -15,6 +15,59 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+DeviceContextPool* DeviceContextPool::pool = nullptr;
+const platform::DeviceContext* DeviceContextPool::Borrow(
+    const platform::Place& place) {
+  auto it = device_contexts_.find(place);
+  if (it == device_contexts_.end()) {
+    PADDLE_THROW(
+        "'Place' is not supported, Please re-compile with WITH_GPU "
+        "option");
+  }
+  return it->second;
+}
+std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
+  std::vector<const platform::DeviceContext*> borrowed_contexts;
+  for (auto& place : places) {
+    auto it = device_contexts_.find(place);
+    if (it != device_contexts_.end()) {
+      borrowed_contexts.emplace_back(it->second);
+    } else {
+      PADDLE_THROW(
+          "'Place' is not supported, Please re-compile with WITH_GPU "
+          "option");
+    }
+  }
+  return borrowed_contexts;
+}
+DeviceContextPool::DeviceContextPool(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_.emplace(places[i],
+                               new platform::CPUDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_.emplace(places[i],
+                               new platform::CUDADeviceContext(
+                                   boost::get<platform::CUDAPlace>(places[i])));
+#else
+      PADDLE_THROW(
+          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
+#endif
+    }
+  }
+}
 CPUDeviceContext::CPUDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }
@@ -38,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  }
  ~EigenCudaStreamDevice() override {}
-  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
+  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
    stream_ = cuda_stream;
    place_ = place;
    device_prop_ = &Eigen::m_deviceProperties[place.device];
@@ -77,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  }
 private:
-  GPUPlace place_;
+  CUDAPlace place_;
  const cudaStream_t* stream_;         // not owned;
  const cudaDeviceProp* device_prop_;  // not owned;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
 };
-CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  SetDeviceId(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  eigen_stream_.reset(new EigenCudaStreamDevice());
@@ -125,20 +178,18 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
-CUDNNDeviceContext::CUDNNDeviceContext(CUDNNPlace place)
+CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place)
-    : CUDADeviceContext(place), place_(place) {
+    : CUDADeviceContext(place) {
  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
 }
 CUDNNDeviceContext::~CUDNNDeviceContext() {
-  SetDeviceId(place_.device);
+  SetDeviceId(boost::get<CUDAPlace>(GetPlace()).device);
  Wait();
  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
 }
-Place CUDNNDeviceContext::GetPlace() const { return CUDNNPlace(); }
 cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; }
 #endif

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -11,8 +11,8 @@ limitations under the License. */
 #pragma once
-#include "paddle/platform/enforce.h"
+#include <memory>
-#include "paddle/platform/place.h"
+#include <unordered_map>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
@@ -20,10 +20,13 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
-#include <memory>
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+#include "glog/logging.h"
 namespace paddle {
 namespace platform {
@@ -55,7 +58,7 @@ class EigenCudaStreamDevice;
 class CUDADeviceContext : public DeviceContext {
 public:
-  explicit CUDADeviceContext(GPUPlace place);
+  explicit CUDADeviceContext(CUDAPlace place);
  virtual ~CUDADeviceContext();
  /*! \brief  Wait for all operations completion in the stream. */
@@ -77,7 +80,7 @@ class CUDADeviceContext : public DeviceContext {
  cudaStream_t stream() const;
 private:
-  GPUPlace place_;
+  CUDAPlace place_;
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
@@ -89,21 +92,63 @@ class CUDADeviceContext : public DeviceContext {
 class CUDNNDeviceContext : public CUDADeviceContext {
 public:
-  explicit CUDNNDeviceContext(CUDNNPlace place);
+  explicit CUDNNDeviceContext(CUDAPlace place);
  virtual ~CUDNNDeviceContext();
-  /*! \brief  Return place in the device context. */
-  Place GetPlace() const final;
  /*! \brief  Return cudnn  handle in the device context. */
  cudnnHandle_t cudnn_handle() const;
 private:
  cudnnHandle_t cudnn_handle_;
-  CUDNNPlace place_;
 };
 #endif
+/*! \brief device context pool singleton */
+class DeviceContextPool {
+ public:
+  explicit DeviceContextPool(const std::vector<platform::Place>& places);
+  static DeviceContextPool& Get() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+  /*! \brief  Create should only called by Init function */
+  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+  /*! \brief  Return handle of single device context. */
+  const platform::DeviceContext* Borrow(const platform::Place& place);
+  /*! \brief  Return handle of multi-device context. */
+  std::vector<const platform::DeviceContext*> Borrow(
+      const std::vector<platform::Place>& places);
+  ~DeviceContextPool() {}
+ private:
+  static DeviceContextPool* pool;
+  constexpr static int LEFT_SHIFT = 8;
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      int pre_hash = place.which() + (1 << LEFT_SHIFT);
+      if (platform::is_gpu_place(place)) {
+        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
+      }
+      return hash_(pre_hash);
+    }
+  };
+  std::unordered_map<const platform::Place, const platform::DeviceContext*,
+                     Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+#include "glog/logging.h"
 TEST(Device, Init) {
  using paddle::platform::DeviceContext;
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  int count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
    ASSERT_NE(nullptr, gpu_device);
    delete device_context;
@@ -31,11 +33,11 @@ TEST(Device, Init) {
 TEST(Device, CUDADeviceContext) {
  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::GPUPlace;
+  using paddle::platform::CUDAPlace;
  int count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
    ASSERT_NE(nullptr, gpu_device);
    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
@@ -49,12 +51,11 @@ TEST(Device, CUDADeviceContext) {
 TEST(Device, CUDNNDeviceContext) {
  using paddle::platform::CUDNNDeviceContext;
-  using paddle::platform::CUDNNPlace;
+  using paddle::platform::CUDAPlace;
  if (paddle::platform::dynload::HasCUDNN()) {
    int count = paddle::platform::GetCUDADeviceCount();
    for (int i = 0; i < count; ++i) {
-      CUDNNDeviceContext* device_context =
+      CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i));
-          new CUDNNDeviceContext(CUDNNPlace(i));
      cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
      ASSERT_NE(nullptr, cudnn_handle);
      ASSERT_NE(nullptr, device_context->stream());
@@ -62,3 +63,54 @@ TEST(Device, CUDNNDeviceContext) {
    }
  }
 }
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::CUDAPlace;
+  DeviceContextPool& pool = DeviceContextPool::Get();
+  auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
+  EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
+  std::vector<Place> gpu_places;
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    gpu_places.emplace_back(CUDAPlace(i));
+  }
+  auto dev_ctxs = pool.Borrow(gpu_places);
+  for (size_t i = 0; i < dev_ctxs.size(); ++i) {
+    auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
+    // check same as CUDAPlace(i)
+    CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
+    EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
+  }
+}
+int main(int argc, char** argv) {
+  int dev_count = paddle::platform::GetCUDADeviceCount();
+  if (dev_count <= 1) {
+    LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
+                    "device count is "
+                 << dev_count;
+    return 0;
+  }
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -63,6 +63,8 @@ extern void LoadNCCLDSO();
  __macro(ncclAllReduce);               \
  __macro(ncclBcast);                   \
  __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
  __macro(ncclReduce);                  \
  __macro(ncclGetErrorString);

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
+#include "paddle/platform/macros.h"
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"

--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -12,17 +12,19 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/init.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/dynload/nccl.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
 static int dev_count = 0;
 namespace paddle {
@@ -31,7 +33,8 @@ namespace platform {
 TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
-  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
@@ -47,7 +50,7 @@ struct PerThreadData {
  T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
-  PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) {
+  PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
    send_buff.resize(size);
    for (size_t i = 0; i < size; ++i) {
      send_buff[i] = static_cast<T>(i);
@@ -131,6 +134,18 @@ int main(int argc, char** argv) {
        << dev_count;
    return 0;
  }
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Create(places);
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -23,8 +23,9 @@ class PlacePrinter : public boost::static_visitor<> {
 public:
  explicit PlacePrinter(std::ostream &os) : os_(os) {}
  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
-  void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; }
+  void operator()(const CUDAPlace &p) {
-  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
+    os_ << "CUDAPlace(" << p.device << ")";
+  }
 private:
  std::ostream &os_;
@@ -37,20 +38,14 @@ static Place the_default_place;
 void set_place(const Place &place) { the_default_place = place; }
 const Place &get_place() { return the_default_place; }
-const GPUPlace default_gpu() { return GPUPlace(0); }
+const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
-const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }
 bool is_gpu_place(const Place &p) {
-  return boost::apply_visitor(IsGPUPlace(), p);
+  return boost::apply_visitor(IsCUDAPlace(), p);
-}
-bool is_cpu_place(const Place &p) {
-  return !is_gpu_place(p) && !is_mkldnn_place(p);
 }
-bool is_mkldnn_place(const Place &p) {
+bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
-  return boost::apply_visitor(IsMKLDNNPlace(), p);
-}
 bool places_are_same_class(const Place &p1, const Place &p2) {
  return p1.which() == p2.which();

--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -31,65 +31,35 @@ struct CPUPlace {
  inline bool operator!=(const CPUPlace &) const { return false; }
 };
-struct MKLDNNPlace {
+struct CUDAPlace {
-  MKLDNNPlace() {}
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
-  // needed for variant equality comparison
-  inline bool operator==(const MKLDNNPlace &) const { return true; }
-  inline bool operator!=(const MKLDNNPlace &) const { return false; }
-};
-struct GPUPlace {
-  GPUPlace() : GPUPlace(0) {}
-  explicit GPUPlace(int d) : device(d) {}
  inline int GetDeviceId() const { return device; }
  // needed for variant equality comparison
-  inline bool operator==(const GPUPlace &o) const { return device == o.device; }
+  inline bool operator==(const CUDAPlace &o) const {
-  inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
  int device;
 };
-struct CUDNNPlace : public GPUPlace {
+struct IsCUDAPlace : public boost::static_visitor<bool> {
-  CUDNNPlace() : GPUPlace() {}
-  explicit CUDNNPlace(int d) : GPUPlace(d) {}
-};
-struct IsGPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const MKLDNNPlace &) const { return false; }
+  bool operator()(const CUDAPlace &gpu) const { return true; }
-  bool operator()(const GPUPlace &gpu) const { return true; }
 };
-struct IsMKLDNNPlace : public boost::static_visitor<bool> {
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
-  bool operator()(const MKLDNNPlace &) const { return true; }
-  bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const GPUPlace &) const { return false; }
-};
-// Define the max number of Place in bit length. i.e., the max number of places
-// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
-#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
-typedef boost::variant<CUDNNPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place;
-// static check number of place types is less equal than
-// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
-BOOST_MPL_ASSERT((boost::mpl::less_equal<
-                  Place::types::size,
-                  boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
 void set_place(const Place &);
 const Place &get_place();
-const GPUPlace default_gpu();
+const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
-const MKLDNNPlace default_mkldnn();
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
-bool is_mkldnn_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 std::ostream &operator<<(std::ostream &, const Place &);

--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -4,45 +4,34 @@
 TEST(Place, Equality) {
  paddle::platform::CPUPlace cpu;
-  paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
-  paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0);
  EXPECT_EQ(cpu, cpu);
  EXPECT_EQ(g0, g0);
  EXPECT_EQ(g1, g1);
  EXPECT_EQ(g0, gg0);
-  EXPECT_EQ(d0, dd0);
  EXPECT_NE(g0, g1);
-  EXPECT_NE(d0, d1);
  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
-  EXPECT_TRUE(paddle::platform::is_gpu_place(d0));
-  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, d0));
 }
 TEST(Place, Default) {
  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
-  EXPECT_TRUE(
-      paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn()));
+  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
  paddle::platform::set_place(paddle::platform::CPUPlace());
  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
-  paddle::platform::set_place(paddle::platform::MKLDNNPlace());
-  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
-  EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place()));
 }
 TEST(Place, Print) {
  {
    std::stringstream ss;
-    ss << paddle::platform::GPUPlace(1);
+    ss << paddle::platform::CUDAPlace(1);
-    EXPECT_EQ("GPUPlace(1)", ss.str());
+    EXPECT_EQ("CUDAPlace(1)", ss.str());
  }
  {
    std::stringstream ss;

--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -49,7 +49,7 @@ TEST(Transform, CPUUnary) {
 TEST(Transform, GPUUnary) {
  using namespace paddle::platform;
  using namespace paddle::memory;
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
  CUDADeviceContext ctx(gpu0);
  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
@@ -80,7 +80,7 @@ TEST(Transform, GPUBinary) {
  using namespace paddle::platform;
  using namespace paddle::memory;
  int buf[4] = {1, 2, 3, 4};
-  GPUPlace gpu0(0);
+  CUDAPlace gpu0(0);
  CUDADeviceContext ctx(gpu0);
  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());

--- a/paddle/pybind/const_value.cc
+++ b/paddle/pybind/const_value.cc
@@ -23,6 +23,11 @@ void BindConstValue(pybind11::module& m) {
  m.def("kTempVarName", [] { return framework::kTempVarName; });
  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+  // for kernel_hint key
+  m.def("kUseCPU", [] { return framework::kUseCPU; });
+  m.def("kUseCUDNN", [] { return framework::kUseCUDNN; });
+  m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; });
 }
 }  // namespace pybind

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) {
             self.Resize(make_ddim(dim));
           })
      .def("alloc_float",
-           [](Tensor &self, paddle::platform::GPUPlace &place) {
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_float",
@@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](Tensor &self, paddle::platform::GPUPlace &place) {
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("set", PyCPUTensorSetFromArray<float>)
@@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle.
                    return new paddle::platform::CPUDeviceContext();
                  })
      .def_static("create",
-                  [](paddle::platform::GPUPlace& place)
+                  [](paddle::platform::CUDAPlace& place)
                      -> paddle::platform::DeviceContext* {
 #ifndef PADDLE_WITH_CUDA
-                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+                    PADDLE_THROW("CUDAPlace is not supported in CPU device.");
 #else
                    return new paddle::platform::CUDADeviceContext(place);
 #endif
@@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_CUDA
  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
-  py::class_<platform::GPUPlace>(m, "GPUPlace")
+  py::class_<platform::CUDAPlace>(m, "CUDAPlace")
      .def(py::init<int>())
-      .def("__str__", string::to_string<const platform::GPUPlace &>);
+      .def("__str__", string::to_string<const platform::CUDAPlace &>);
  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
      .def(py::init<>())
@@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle.
             self = cpu_place;
           })
      .def("set_place",
-           [](platform::Place &self, const platform::GPUPlace &gpu_place) {
+           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
             self = gpu_place;
           });
@@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle.
           })
      .def("run",
           [](OperatorBase &self, const Scope &scope,
-              const platform::DeviceContext &dev_ctx) {
+              const platform::CPUPlace &place) { self.Run(scope, place); })
-             self.Run(scope, dev_ctx);
+      .def("run",
-             dev_ctx.Wait();
+           [](OperatorBase &self, const Scope &scope,
-           })
+              const platform::CUDAPlace &place) { self.Run(scope, place); })
      .def("type",
           [](const OperatorBase &op) -> std::string { return op.Type(); })
      .def("outputs",
@@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle.
           });
  py::class_<framework::Executor>(m, "Executor")
-      .def(py::init<std::vector<platform::Place> &>())
+      .def(py::init<const platform::Place &>())
      .def("run", &Executor::Run);
  m.def("unique_integer", UniqueIntegerGenerator);

--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -14,9 +14,9 @@
 #pragma once
 #include <string>
-#include "paddle/framework/executor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));
-        framework::DeviceContextPool &pool =
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-            framework::DeviceContextPool::Get();
        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
            pool.Borrow(tensor.place()));
@@ -72,7 +71,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
            cudaMemcpyDeviceToHost, dev_ctx->stream());
 #else
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+        PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
@@ -128,7 +127,7 @@ template <typename T>
 void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::GPUPlace &place) {
+    paddle::platform::CUDAPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
@@ -138,7 +137,7 @@ void PyCUDATensorSetFromArray(
  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
-  framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),

--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -6,7 +6,6 @@ if(WITH_TESTING)
  add_library(paddle_test_util STATIC TestUtil.cpp)
  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
  if(NOT MOBILE_INFERENCE)
-    add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
-    add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
  endif()
 endif()
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cstring>
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/init.h"
 #include "paddle/memory/memory.h"
 int main(int argc, char** argv) {
@@ -32,8 +34,11 @@ int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
  testing::InitGoogleTest(&argc, argv);
  paddle::memory::Used(paddle::platform::CPUPlace());
+  std::vector<std::string> devs = {"CPU"};
 #ifdef PADDLE_WITH_CUDA
-  paddle::memory::Used(paddle::platform::GPUPlace(0));
+  paddle::memory::Used(paddle::platform::CUDAPlace(0));
+  devs.push_back("GPU:0");
 #endif
+  paddle::framework::InitDevices(devs);
  return RUN_ALL_TESTS();
 }
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -270,7 +270,7 @@ class LayerType(object):
    @staticmethod
    def is_layer_type(type_name):
        """
-        If type_name is a layer type.
+        Whether type_name is a layer type.
        :param type_name: layer type name. Because layer type enumerations are
                          strings.
@@ -441,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None):
       with mixed_layer(size=100) as m:
           m += full_matrix_projection(input=layer)
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
    .. code-block:: python
@@ -451,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None):
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of this layer.
    :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: A FullMatrixProjection Object.
+    :return: FullMatrixProjection Object.
    :rtype: FullMatrixProjection
    """
    proj = FullMatrixProjection(
@@ -468,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None):
 def trans_full_matrix_projection(input, size=0, param_attr=None):
    """
    Different from full_matrix_projection, this projection performs matrix
-    multiplication, using transpose of weight.
+    multiplication, using the transpose of weight.
    ..  math::
        out.row[i] += in.row[i] * w^\mathrm{T}
-    :math:`w^\mathrm{T}` means transpose of weight.
+    :math:`w^\mathrm{T}` means the transpose of weight.
    The simply usage is:
    .. code-block:: python
@@ -489,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None):
    :type input: LayerOutput
    :param size: The parameter size. Means the width of parameter.
    :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: A TransposedFullMatrixProjection Object.
+    :return: TransposedFullMatrixProjection Object.
    :rtype: TransposedFullMatrixProjection
    """
    proj = TransposedFullMatrixProjection(
@@ -521,7 +521,7 @@ def table_projection(input, size=0, param_attr=None):
       with mixed_layer(size=100) as m:
           m += table_projection(input=layer)
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
    .. code-block:: python
@@ -532,11 +532,11 @@ def table_projection(input, size=0, param_attr=None):
    :param input: The input of this layer, which must contains id fields.
    :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of the output.
    :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: A TableProjection Object.
+    :return: TableProjection Object.
    :rtype: TableProjection
    """
    proj = TableProjection(
@@ -547,7 +547,7 @@ def table_projection(input, size=0, param_attr=None):
 def identity_projection(input, offset=None, size=None):
    """
-    1. IdentityProjection if offset=None. It performs:
+    1. If offset=None, it performs IdentityProjection as follows:
    .. math::
       out.row[i] += in.row[i]
@@ -559,9 +559,8 @@ def identity_projection(input, offset=None, size=None):
       proj = identity_projection(input=layer)
-    2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection,
+    2. If offset!=None, It executes IdentityOffsetProjection and takes the
-    but layer size may be smaller than input size.
+       elements of the input in the range [offset, offset+size) as output.
-    It select dimesions [offset, offset+layer_size) from input:
    .. math::
       out.row[i] += in.row[i + \\textrm{offset}]
@@ -573,14 +572,20 @@ def identity_projection(input, offset=None, size=None):
       proj = identity_projection(input=layer,
                                  offset=10)
-    Note that both of two projections should not have any parameter.
+    Note that neither of the projections have trainable parameter.
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param offset: Offset, None if use default.
+    :param offset: The offset from the start of the input. The input's
+                   elements in the range [offset, offset+size) will be
+                   taken as output. If this parameter is not set or set
+                   to None, the output will be the same as the input.
    :type offset: int
-    :return: A IdentityProjection or IdentityOffsetProjection object
+    :param size: The dimension of this layer. It will be neglected
-    :rtype: IdentityProjection or IdentityOffsetProjection
+                 when offset is None or not set.
+    :type size: int
+    :return: IdentityProjection or IdentityOffsetProjection object
+    :rtype: IdentityProjection | IdentityOffsetProjection
    """
    if offset is None:
        proj = IdentityProjection(input_layer_name=input.name)
@@ -596,8 +601,8 @@ def identity_projection(input, offset=None, size=None):
 def slice_projection(input, slices):
    """
-    slice_projection can slice the input value into multiple parts,
+    slice_projection slices the input value into multiple parts,
-    and then select some of them to merge into a new output.
+    then selects and merges some of them into a new output.
    .. math::
       output = [input.slices()]
@@ -608,15 +613,13 @@ def slice_projection(input, slices):
       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
-    Note that slice_projection should not have any parameter.
+    Note that slice_projection has no trainable parameter.
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param slices: An array of slice parameters.
+    :param slices: A list of start and end offsets of each slice.
-                   Each slice contains the start and end offsets based
+    :type slices: list of tuple
-                   on the input.
+    :return: SliceProjection object.
-    :type slices: pair of int
-    :return: A SliceProjection object
    :rtype: SliceProjection
    """
    assert len(slices) >= 1
@@ -636,8 +639,7 @@ def slice_projection(input, slices):
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
    """
-    scaling_projection multiplies the input with a scalar parameter and add to
+    scaling_projection multiplies the input with a scalar parameter.
-    the output.
    .. math::
       out += w * in
@@ -650,9 +652,9 @@ def scaling_projection(input, param_attr=None):
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: A ScalingProjection object
+    :return: ScalingProjection object.
    :rtype: ScalingProjection
    """
    proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
@@ -663,8 +665,8 @@ def scaling_projection(input, param_attr=None):
 @wrap_param_attr_default()
 def dotmul_projection(input, param_attr=None):
    """
-    DotMulProjection with a layer as input.
+    DotMulProjection takes a layer as input and performs
-    It performs element-wise multiplication with weight.
+    element-wise multiplication with weight.
    ..  math::
        out.row[i] += in.row[i] .* weight
@@ -679,9 +681,9 @@ def dotmul_projection(input, param_attr=None):
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: A DotMulProjection Object.
+    :return: DotMulProjection object.
    :rtype: DotMulProjection
    """
    proj = DotMulProjection(
@@ -698,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
       out.row[i] += scale * (a.row[i] .* b.row[i])
    where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is one.
+    scale is a config scalar, its default value is 1.
    The example usage is:
@@ -706,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
       op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
-    :param a: Input layer1
+    :param a: The first input of this layer.
    :type a: LayerOutput
-    :param b: Input layer2
+    :param b: The second input of this layer.
    :type b: LayerOutput
-    :param scale: config scalar, default value is one.
+    :param scale: A scalar to scale the product. Its default value is 1.
    :type scale: float
-    :return: A DotMulOperator Object.
+    :return: DotMulOperator object.
    :rtype: DotMulOperator
    """
    if 'x' in kwargs or 'y' in kwargs:
@@ -738,28 +740,29 @@ def context_projection(input,
    """
    Context Projection.
-    It just simply reorganizes input sequence, combines "context_len" sequence
+    It just reorganizes input sequence, combines "context_len" elements of the
-    to one context from context_start. "context_start" will be set to
+    sequence to one context from context_start. "context_start" will be set to
-    -(context_len - 1) / 2 by default. If context position out of sequence
+    -(context_len - 1) / 2 by default. When context position is out of sequence
    length, padding will be filled as zero if padding_attr = False, otherwise
    it is trainable.
-    For example, origin sequence is [A B C D E F G], context len is 3, then
+    For example, origin sequence is [A B C D E F G], context len is 3, padding_attr
-    after context projection and not set padding_attr, sequence will
+    is not set, then after context projection, sequence will
    be [ 0AB ABC BCD CDE DEF EFG FG0 ].
    :param input: The input of this layer, which should be a sequence.
    :type input: LayerOutput
-    :param context_len: context length.
+    :param context_len: The length of the context.
    :type context_len: int
-    :param context_start: context start position. Default is
+    :param context_start: The start position of the context. The default value is
                          -(context_len - 1)/2
    :type context_start: int
-    :param padding_attr: Padding Parameter Attribute. If false, it means padding
+    :param padding_attr: Parameter attribute of the padding. If the parameter is
-                         always be zero. Otherwise Padding is learnable, and
+                         set to False, padding will be zero. In other cases, the
-                         parameter attribute is set by this parameter.
+                         padding is trainable, and its parameter attribute is set
+                         by this parameter.
    :type padding_attr: bool | ParameterAttribute
-    :return: Projection
+    :return: Projection object.
    :rtype: Projection
    """
    context_start = -(
@@ -791,10 +794,9 @@ class MixedLayerType(LayerOutput):
    def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
        """
-        Ctor.
+        :param name: The name of this layer.
-        :param name: layer name.
        :type name: basestring
-        :param size: layer size.
+        :param size: The dimension of this layer.
        :type size: int
        :param act: Activation type.
        :type act: BaseActivation
@@ -802,8 +804,9 @@ class MixedLayerType(LayerOutput):
                          whose type is not ParameterAttribute, no bias is defined. If the
                          parameter is set to True, the bias is initialized to zero.
        :type bias_attr: ParameterAttribute | None | bool | Any
-        :param layer_attr: Extra Layer Attribute.
+        :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-        :type layer_attr: ExtraLayerAttribute or None
+                           details.
+        :type layer_attr: ExtraLayerAttribute | None
        """
        LayerOutput.__init__(
            self,
@@ -868,12 +871,12 @@ def mixed_layer(size=0,
                bias_attr=False,
                layer_attr=None):
    """
-    Mixed Layer. A mixed layer will add all inputs together, then activate.
+    Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
-    Each inputs is a projection or operator.
+    Each input is a projection or operator.
    There are two styles of usages.
-    1. When not set inputs parameter, use mixed_layer like this:
+    1. When the parameter input is not set, use mixed_layer like this:
    .. code-block:: python
@@ -889,21 +892,21 @@ def mixed_layer(size=0,
                       input=[full_matrix_projection(input=layer1),
                              full_matrix_projection(input=layer2)])
-    :param name: mixed layer name. Can be referenced by other layer.
+    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param size: layer size.
+    :param size: The dimension of this layer.
    :type size: int
-    :param input: The input of this layer. It is an optional parameter. If set,
+    :param input: The input of this layer. It is an optional parameter.
-                  then this function will just return layer's name.
    :param act: Activation Type. LinearActivation is the default activation.
    :type act: BaseActivation
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer config. Default is None.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :return: MixedLayerType object can add inputs or layer name.
+    :return: MixedLayerType object.
    :rtype: MixedLayerType
    """
@@ -938,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None,
    :param name: The name of this layer.
    :type name: basestring
-    :param size: Size of this data layer.
+    :param size: The dimension of this data layer.
    :type size: int
-    :param height: Height of this data layer, used for image
+    :param height: The height of the input image data.
    :type height: int | None
-    :param width: Width of this data layer, used for image
+    :param width: The width of the input image data.
    :type width: int | None
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-    :type layer_attr: ExtraLayerAttribute.
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -978,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input: The input of this layer, which must be Index Data.
+    :param input: The input of this layer, whose type must be Index Data.
    :type input: LayerOutput
-    :param size: The embedding dimension.
+    :param size: The dimension of the embedding vector.
    :type size: int
    :param param_attr: The embedding parameter attribute. See ParameterAttribute
                      for details.
-    :type param_attr: ParameterAttribute | None
+    :type param_attr: ParameterAttribute
-    :param layer_attr: Extra layer Config. Default is None.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -1013,7 +1018,7 @@ def fc_layer(input,
             bias_attr=None,
             layer_attr=None):
    """
-    Helper for declare fully connected layer.
+    The fully connected layer.
    The example usage is:
@@ -1035,17 +1040,18 @@ def fc_layer(input,
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput | list | tuple
-    :param size: The layer dimension.
+    :param size: The dimension of this layer.
    :type size: int
    :param act: Activation Type. TanhActivation is the default activation.
    :type act: BaseActivation
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -1086,13 +1092,15 @@ def fc_layer(input,
 @wrap_name_default("print")
 def printer_layer(input, format=None, name=None):
    """
-    Print the output value of input layers. This layer is useful for debugging.
+    Print the output value of the layers specified by the parameter input.
+    This layer is useful for debugging.
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput | list | tuple
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    if isinstance(input, LayerOutput):
        input = [input]
@@ -1135,11 +1143,12 @@ def priorbox_layer(input,
    :param aspect_ratio: The aspect ratio.
    :type aspect_ratio: list
    :param variance: The bounding box variance.
-    :type min_size: The min size of the priorbox width/height.
+    :type min_size: The minimum size of the priorbox width/height.
    :param min_size: list
-    :type max_size: The max size of the priorbox width/height. Could be NULL.
+    :type max_size: The maximum size of the priorbox width/height. It could be NULL.
    :param max_size: list
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    # plus one for ratio 1.
    num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
@@ -1177,7 +1186,7 @@ def multibox_loss_layer(input_loc,
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input_loc: The input predict locations.
+    :param input_loc: The input predicted locations.
    :type input_loc: LayerOutput | List of LayerOutput
    :param input_conf: The input priorbox confidence.
    :type input_conf: LayerOutput | List of LayerOutput
@@ -1189,13 +1198,15 @@ def multibox_loss_layer(input_loc,
    :type num_classes: int
    :param overlap_threshold: The threshold of the overlap.
    :type overlap_threshold: float
-    :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox.
+    :param neg_pos_ratio: The ratio of the negative bounding box to
+                          the positive bounding box.
    :type neg_pos_ratio: float
-    :param neg_overlap: The negative bbox overlap threshold.
+    :param neg_overlap: The negative bounding box overlap threshold.
    :type neg_overlap: float
    :param background_id: The background class index.
    :type background_id: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    if isinstance(input_loc, LayerOutput):
        input_loc = [input_loc]
@@ -1258,19 +1269,20 @@ def detection_output_layer(input_loc,
    :type input_conf: LayerOutput | List of LayerOutput.
    :param priorbox: The input priorbox location and the variance.
    :type priorbox: LayerOutput
-    :param num_classes: The number of the classification.
+    :param num_classes: The number of the classes.
    :type num_classes: int
    :param nms_threshold: The Non-maximum suppression threshold.
    :type nms_threshold: float
-    :param nms_top_k: The bbox number kept of the NMS's output
+    :param nms_top_k: The bounding boxes number kept of the NMS's output.
    :type nms_top_k: int
-    :param keep_top_k: The bbox number kept of the layer's output
+    :param keep_top_k: The bounding boxes number kept of the layer's output.
    :type keep_top_k: int
-    :param confidence_threshold: The classification confidence threshold
+    :param confidence_threshold: The classification confidence threshold.
    :type confidence_threshold: float
    :param background_id: The background class index.
    :type background_id: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    if isinstance(input_loc, LayerOutput):
        input_loc = [input_loc]
@@ -1326,7 +1338,7 @@ def roi_pool_layer(input,
    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
    feature map.
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input layer.
    :type input: LayerOutput.
@@ -1338,9 +1350,10 @@ def roi_pool_layer(input,
    :type pooled_height: int
    :param spatial_scale: The spatial scale between the image and feature map.
    :type spatial_scale: float
-    :param num_channels: number of input channel.
+    :param num_channels: The number of the input channels.
    :type num_channels: int
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    if num_channels is None:
        assert input.num_filters is not None
@@ -1361,18 +1374,19 @@ def roi_pool_layer(input,
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
    """
-    Normalize a layer's output. This layer is necessary for ssd.
+    Normalize a layer's output. This layer is necessary for ssd. This
-    This layer applys normalize across the channels of each sample to
+    layer applys normalization across the channels of each sample to
-    a conv layer's output and scale the output by a group of trainable
+    a convolutional layer's output and scales the output by a group of
-    factors which dimensions equal to the channel's number.
+    trainable factors whose dimensions equal to the channel's number.
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    assert input.num_filters is not None
    Layer(
@@ -1413,12 +1427,9 @@ def pooling_layer(input,
    Pooling layer for sequence inputs, not used for Image.
    If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the pooling value of the window as the output. Thus, a long sequence
+    and returns the pooling value of the sequence in the window as the output. Thus,
-    will be shorten.
+    a long sequence will be shortened. Note that for sequence with sub-sequence, the
+    default value of stride is -1.
-    The parameter stride specifies the intervals at which to apply the pooling
-    operation. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
    The example usage is:
@@ -1435,16 +1446,16 @@ def pooling_layer(input,
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
+    :param pooling_type: Type of pooling. MaxPooling is the default pooling.
-                         SumPooling, SquareRootNPooling.
    :type pooling_type: BasePoolingType | None
    :param stride: The step size between successive pooling regions.
-    :type stride: Int
+    :type stride: int
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The Extra Attributes for layer, such as dropout.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -6618,7 +6629,7 @@ def row_conv_layer(input,
    .. math::
        r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
-                  \quad \text{for} \quad  (1 \leq i \leq d)
+                  \quad \\text{for} \quad  (1 \leq i \leq d)
    Note:
        The `context_len` is `k + 1`. That is to say, the lookahead step
@@ -6767,7 +6778,7 @@ def gated_unit_layer(input,
    The gated unit layer implements a simple gating mechanism over the input.
    The input :math:`X` is first projected into a new space :math:`X'`, and
    it is also used to produce a gate weight :math:`\sigma`. Element-wise
-    product between :match:`X'` and :math:`\sigma` is finally returned.
+    product between :math:`X'` and :math:`\sigma` is finally returned.
    Reference:
        `Language Modeling with Gated Convolutional Networks
@@ -7463,7 +7474,7 @@ def factorization_machine(input,
    Factorization Machine with the formula:
    .. math::
-        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
    Note:
        X is the input vector with size n. V is the factor matrix. Each row of V

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -15,14 +15,14 @@ import backward
 import regularizer
 from param_attr import ParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, GPUPlace
+from core import LoDTensor, CPUPlace, CUDAPlace
 from distribute_transpiler import DistributeTranspiler
 import clip
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr'
    'DataFeeder', 'clip', 'DistributeTranspiler'
 ]
@@ -42,5 +42,10 @@ def __read_gflags_from_env__():
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
+    if core.is_compile_gpu():
+        core.init_devices(["CPU", "GPU:0"])
+    else:
+        core.init_devices(["CPU"])
 __read_gflags_from_env__()
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -47,13 +47,14 @@ class Executor(object):
            act_places.append(p)
        # TODO(dzhwinter) : consider that our fluid tests all written in 
-        # GPUPlace(gpu_id), this will be changed in next PR.
+        # CUDAPlace(gpu_id), this will be changed in the future
        if core.is_compile_gpu():
            core.init_devices(["CPU", "GPU:0"])
        else:
            core.init_devices(["CPU"])
-        self.executor = core.Executor(act_places)
+        # TODO(dzhwinter) : only use the first place
+        self.executor = core.Executor(act_places[0])
        self.places = places
    def aslodtensor(self, data):

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -17,6 +17,10 @@ TEMP_VAR_NAME = core.kTempVarName()
 GRAD_VAR_SUFFIX = core.kGradVarSuffix()
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
+USE_CPU = core.kUseCPU()
+USE_CUDNN = core.kUseMKLDNN()
+USE_MKLDNN = core.kUseMKLDNN()
 def grad_var_name(var_name):
    """

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -163,8 +163,9 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
    Examples:
        .. code-block:: python
+          dict_size = len(dataset.ids)
          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
-          fc = fluid.layers.embedding(input=data, size=16)
+          fc = fluid.layers.embedding(input=data, size=[dict_size, 16])
    """
    helper = LayerHelper('embedding', **locals())

--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
 import paddle.v2.fluid.core as core
 from contextlib import contextmanager
+import os
 __all__ = ['CudaProfiler']
@@ -30,17 +31,21 @@ def cuda_profiler(output_file, output_mode=None, config=None):
            written into this file.
        output_mode (string) : The output mode has Key-Value pair format and
            Comma separated values format. It should be 'kvp' or 'csv'.
-        config (string) : The profiler options and counters can refer to
+        config (list of string) : The profiler options and counters can refer
-            "Compute Command Line Profiler User Guide".
+            to "Compute Command Line Profiler User Guide".
    """
    if output_mode is None:
        output_mode = 'csv'
    if output_mode not in ['kvp', 'csv']:
        raise ValueError("The output mode must be 'kvp' or 'csv'.")
    config = NVPROF_CONFIG if config is None else config
-    core.nvprof_init(output_file, output_mode, config)
+    config_file = 'nvprof_config_file'
+    with open(config_file, 'wb') as fp:
+        fp.writelines(["%s\n" % item for item in config])
+    core.nvprof_init(output_file, output_mode, config_file)
    # Enables profiler collection by the active CUDA profiling tool.
    core.nvprof_start()
    yield
    # Disables profiler collection.
    core.nvprof_stop()
+    os.remove(config_file)
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -125,10 +125,11 @@ def model():
    # need cos sim
    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
    label = layers.data(name='score', shape=[1], dtype='float32')
-    square_cost = layers.square_error_cost(input=inference, label=label)
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
    avg_cost = layers.mean(x=square_cost)
@@ -141,7 +142,7 @@ def main():
    opts = sgd_optimizer.minimize(cost)
    if USE_GPU:
-        place = core.GPUPlace(0)
+        place = core.CUDAPlace(0)
    else:
        place = core.CPUPlace()

--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -90,12 +90,10 @@ def get_numeric_gradient(scope,
    def product(dim):
        return reduce(lambda a, b: a * b, dim, 1)
-    ctx = core.DeviceContext.create(core.CPUPlace())
    def get_output():
        sum = []
        for output_name in output_names:
-            op.run(scope, ctx)
+            op.run(scope, core.CPUPlace())
            sum.append(
                np.array(scope.find_var(output_name).get_tensor()).mean())
        return np.array(sum).mean()
@@ -318,7 +316,7 @@ class OpTest(unittest.TestCase):
    def check_output(self, atol=1e-5):
        places = [core.CPUPlace()]
        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_output_with_place(place, atol)
@@ -381,7 +379,7 @@ class OpTest(unittest.TestCase):
                               "Gradient Check On %s" % str(cpu_place))
        if core.is_compile_gpu() and self.op.support_gpu():
-            gpu_place = core.GPUPlace(0)
+            gpu_place = core.CUDAPlace(0)
            gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place,
                                                    output_names, no_grad_set)

--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase):
            LearningRate='LearningRate',
            epsilon=2.0)
-        ctx = core.DeviceContext.create(place)
+        adagrad_op.run(scope, place)
-        adagrad_op.run(scope, ctx)
        # get and compare moment result
        moment_result_array = np.array(moment)
@@ -168,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase):
    def test_sparse_adagrad(self):
        places = [core.CPUPlace()]
        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)

--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -296,8 +296,7 @@ class TestBatchNormOp(OpTest):
                momentum=momentum,
                epsilon=epsilon)
-            ctx = core.DeviceContext.create(place)
+            batch_norm_op.run(scope, place)
-            batch_norm_op.run(scope, ctx)
            # check forward result
            self.__assert_close(y_tensor, y_out, "y_out")
@@ -305,7 +304,7 @@ class TestBatchNormOp(OpTest):
            self.__assert_close(saved_variance_tensor, saved_variance,
                                "saved_variance")
            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.GPUPlace):
+            if isinstance(place, core.CUDAPlace):
                atol = 5e-2
            else:
                atol = 1e-4
@@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest):
                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
                place,
                feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, ctx)
+            batch_norm_op_grad.run(scope, place)
            x_grad_tensor = create_or_get_tensor(scope,
                                                 grad_var_name("x_val"), None,
@@ -340,7 +339,7 @@ class TestBatchNormOp(OpTest):
        places = [core.CPUPlace()]
        if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
            core.init_devices(["CPU", "GPU:0"])
        else:

--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
            SentenceIds="sentence_ids",
            SentenceScores="sentence_scores")
-        ctx = core.DeviceContext.create(self.cpu_place)
+        beam_search_decode_op.run(self.scope, self.cpu_place)
-        beam_search_decode_op.run(self.scope, ctx)
        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
        self.assertEqual(sentence_ids.lod(), expected_lod)

--- a/python/paddle/v2/fluid/tests/test_beam_search_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
@@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data):
 class BeamSearchOpTester(unittest.TestCase):
    def setUp(self):
        self.scope = core.Scope()
-        self.ctx = core.DeviceContext.create(core.CPUPlace())
        self._create_ids()
        self._create_scores()
        self._create_pre_ids()
@@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase):
            level=0,
            beam_size=2,
            end_id=0, )
-        op.run(self.scope, self.ctx)
+        op.run(self.scope, core.CPUPlace())
        selected_ids = self.scope.find_var("selected_ids").get_tensor()
        print 'selected_ids', np.array(selected_ids)
        print 'lod', selected_ids.lod()

--- a/python/paddle/v2/fluid/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase):
        self.create_global_variables()
        self.create_cond_op()
        self.create_sub_net()
-        ctx = core.DeviceContext.create(core.CPUPlace())
+        self.condop.run(self.scope, core.CPUPlace())
-        self.condop.run(self.scope, ctx)
        return np.array(self.scope.find_var("Out").get_tensor())
    def create_global_variables(self):

--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -20,11 +20,10 @@ class TestGaussianRandomOp(unittest.TestCase):
    def test_gpu(self):
        if core.is_compile_gpu():
-            self.gaussian_random_test(place=fluid.GPUPlace(0))
+            self.gaussian_random_test(place=fluid.CUDAPlace(0))
    def gaussian_random_test(self, place):
-        context = core.DeviceContext.create(place)
        program = fluid.Program()
        block = program.global_block()
        vout = block.create_var(name="Out")

--- a/python/paddle/v2/fluid/tests/test_is_empty_op.py
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase):
    def one_case(self, input, target):
        op = Operator(type="is_empty", X=input, Out="out")
-        ctx = core.DeviceContext.create(core.CPUPlace())
+        op.run(self.scope, core.CPUPlace())
-        op.run(self.scope, ctx)
        out = self.scope.var("out").get_tensor()
        self.assertEqual(np.array(out)[0], target)

--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -3,6 +3,7 @@ import numpy as np
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.profiler as profiler
 import paddle.v2.fluid.layers as layers
+import os
 class TestProfiler(unittest.TestCase):
@@ -14,14 +15,16 @@ class TestProfiler(unittest.TestCase):
        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-        place = fluid.GPUPlace(0)
+        place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+        output_file = 'cuda_profiler.txt'
+        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
            for i in range(epoc):
                input = np.random.random(dshape).astype('float32')
                exe.run(fluid.default_main_program(), feed={'data': input})
+        os.remove(output_file)
 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase):
            Grad='Grad',
            ParamOut='Param',
            LearningRate='LearningRate')
-        ctx = core.DeviceContext.create(place)
+        sgd_op.run(scope, place)
-        sgd_op.run(scope, ctx)
        # get and compare result
        result_array = np.array(param)
@@ -79,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase):
    def test_sparse_sgd(self):
        places = [core.CPUPlace()]
        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
+            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)

--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -23,10 +23,9 @@ class TestUniformRandomOp(unittest.TestCase):
    def test_gpu(self):
        if core.is_compile_gpu():
-            self.uniform_random_test(place=core.GPUPlace(0))
+            self.uniform_random_test(place=core.CUDAPlace(0))
    def uniform_random_test(self, place):
-        context = core.DeviceContext.create(place)
        program = fluid.Program()
        block = program.global_block()
        vout = block.create_var(name="Out")