diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 58e78e9a6a4877f7b3fef61b1715d4e27d6ead79..898b3a990d927583026cd3e9ca8fba9202b72dac 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -44,8 +44,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto ${GLOB_OP_LIB})
-cc_test(executor_test SRCS executor_test.cc DEPS executor)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context_manager scope framework_proto ${GLOB_OP_LIB})
+if(WITH_GPU)
+    nv_test(executor_test SRCS executor_test.cc DEPS executor)
+else()
+    cc_test(executor_test SRCS executor_test.cc DEPS executor)
+endif()
 
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
 cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 94b9b3b350910f0853a95ecd6e0e00af5ac47f8c..717f9bf81a372dec0c5933a874891f52f889b4f8 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -22,9 +22,21 @@ namespace paddle {
 namespace framework {
 
 Executor::Executor(const std::vector<platform::Place>& places) {
-  devices_.resize(places.size());
+  device_contexts_.resize(places.size());
   for (size_t i = 0; i < places.size(); i++) {
-    devices_[i] = platform::GetDevice(places[i]);
+    if (platform::is_cpu_place(places[i])) {
+      auto* manager = platform::DeviceContextManager::Get();
+      device_contexts_[i] = manager->GetDeviceContext<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(places[i]));
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      auto* manager = platform::DeviceContextManager::Get();
+      device_contexts_[i] = manager->GetDeviceContext<platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(places[i]));
+#else
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#endif
+    }
   }
 }
 
@@ -34,37 +46,25 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope,
   // TODO(tonyyang-svail):
   //    - only runs the first block
   //    - only runs on the first device
+  Scope& local_scope = scope->NewScope();
+
   auto& block = pdesc.blocks(0);
-  auto& device = devices_[0];
+  auto& device_context = device_contexts_[0];
 
   for (auto& var : block.vars()) {
-    scope->NewVar(var.name());
+    local_scope.NewVar(var.name());
   }
 
   // std::vector<op_ptr> ops;
   for (auto& op_desc : block.ops()) {
     auto op = framework::OpRegistry::CreateOp(op_desc);
-    // op->InferShape(*scope);
-    op->Run(*scope, *device->cpu_device_context);
+    // InferShape is now done inside the Run method.
+    op->Run(local_scope, *device_context);
   }
 
   // TODO(tonyyang-svail): need to test gpu device
-  //   device_->cpu_device_context->Wait();
-  // #ifndef PADDLE_ONLY_CPU
-  //   if (device_->cuda_device_context) {
-  //     device_->cuda_device_context->Wait();
-  //   }
-  // #endif
-
-  Scope& local_scope = scope->NewScope();
-  local_scope.NewVar();
-  for (auto device : devices_) {
-    device->cpu_device_context->Wait();
-#ifndef PADDLE_ONLY_CPU
-    if (device->cuda_device_context) {
-      device->cuda_device_context->Wait();
-    }
-#endif
+  for (auto device_context : device_contexts_) {
+    device_context->Wait();
   }
 }
 
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index cdb80bc10413d7bc3f4e42065ec4545c625c1b72..795b8ffdab3bf204214d323a76f7ce8df7af8054 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
-#include "paddle/platform/device.h"
+#include "paddle/platform/device_context_manager.h"
 
 namespace paddle {
 namespace framework {
@@ -30,7 +30,7 @@ class Executor {
   void Run(const ProgramDesc&, Scope*, std::vector<Tensor>*);
 
  private:
-  std::vector<platform::Device*> devices_;
+  std::vector<platform::DeviceContext*> device_contexts_;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 11255af808aaee3e17cec70d41ae30fcbc5d5d18..810ff2a512a4185a5cb9d6921f43ace44fcddf7d 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -15,8 +15,6 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include "gtest/gtest.h"
 #include "paddle/framework/attribute.h"
-
-#include <gtest/gtest.h>
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
@@ -26,52 +24,71 @@ USE_OP(elementwise_add);
 using namespace paddle::platform;
 using namespace paddle::framework;
 
-TEST(Executor, Init) {
-  ProgramDesc pdesc;
-
-  auto root_block = pdesc.add_blocks();
-  root_block->set_idx(0);
-  root_block->set_parent_idx(-1);
-
-  auto a = root_block->add_vars();
-  a->set_name("a");
-  auto a_lt = a->mutable_lod_tensor();
-  a_lt->set_data_type(paddle::framework::DataType::FP32);
-  a_lt->add_dims(640);
-  a_lt->add_dims(640);
-
-  auto b = root_block->add_vars();
-  b->set_name("b");
-  auto b_lt = b->mutable_lod_tensor();
-  b_lt->set_data_type(paddle::framework::DataType::FP32);
-  b_lt->add_dims(640);
-  b_lt->add_dims(640);
-
-  auto c = root_block->add_vars();
-  c->set_name("c");
-  auto c_lt = c->mutable_lod_tensor();
-  c_lt->set_data_type(paddle::framework::DataType::FP32);
-  c_lt->add_dims(640);
-  c_lt->add_dims(640);
-
-  auto op1 = root_block->add_ops();
-  op1->set_type("elementwise_add");
-  auto X = op1->add_inputs();
-  X->set_parameter("X");
-  X->add_arguments("a");
-  auto Y = op1->add_inputs();
-  Y->set_parameter("Y");
-  Y->add_arguments("b");
-
-  CPUPlace cpu_place1, cpu_place2;
+// Fixture that builds a one-block ProgramDesc for the Executor tests.
+class ExecutorTester : public ::testing::Test {
+ public:
+  // Fills pdesc_ with a root block holding vars a, b, c and one
+  // elementwise_add op whose inputs X and Y are bound to a and b.
+  void SetUp() override {
+    auto root_block = pdesc_.add_blocks();
+    root_block->set_idx(0);
+    root_block->set_parent_idx(-1);
+
+    // Every variable is declared as a 640 x 640 FP32 LoD tensor.
+    auto declare_var = [root_block](const char* var_name) {
+      auto var = root_block->add_vars();
+      var->set_name(var_name);
+      auto lod_tensor = var->mutable_lod_tensor();
+      lod_tensor->set_data_type(paddle::framework::DataType::FP32);
+      lod_tensor->add_dims(640);
+      lod_tensor->add_dims(640);
+    };
+    declare_var("a");
+    declare_var("b");
+    declare_var("c");
+
+    auto add_op = root_block->add_ops();
+    add_op->set_type("elementwise_add");
+
+    // Input slot X reads variable a.
+    auto input_x = add_op->add_inputs();
+    input_x->set_parameter("X");
+    input_x->add_arguments("a");
+
+    // Input slot Y reads variable b.
+    auto input_y = add_op->add_inputs();
+    input_y->set_parameter("Y");
+    input_y->add_arguments("b");
+  }
+
+ protected:
+  // Output holder handed to Executor::Run; left null by this fixture.
+  std::vector<Tensor>* outputs_{nullptr};
+  ProgramDesc pdesc_;
+  Scope scope_;
+};
+
+TEST_F(ExecutorTester, InitCPU) {  // Runs the fixture program on CPU places.
   std::vector<Place> places;
+  CPUPlace cpu_place1, cpu_place2;  // two places -> two entries in the executor
   places.push_back(cpu_place1);
   places.push_back(cpu_place2);
 
   Executor* executor = new Executor(places);
-  Scope s;
-  std::vector<Tensor>* outputs{nullptr};
-  executor->Run(pdesc, &s, outputs);
+  executor->Run(pdesc_, &scope_, outputs_);  // outputs_ is nullptr here
+  delete executor;
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST_F(ExecutorTester, InitGPU) {
+  // Request only GPU ids that exist; the manager rejects missing ones.
+  ASSERT_GT(GetDeviceCount(), 0);
+  std::vector<Place> places;
+  for (int id = 0; id < 2 && id < GetDeviceCount(); ++id)
+    places.push_back(GPUPlace(id));
 
+  Executor* executor = new Executor(places);
+  executor->Run(pdesc_, &scope_, outputs_);
   delete executor;
 }
+#endif
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index b581937393520ec6a47991c16c093db65a942162..b4ddf721ddb3776406ad44a13bcf9876a905a0df 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -23,7 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
     system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
-cc_library(device SRCS device.cc DEPS device_context)
+cc_library(device_context_manager SRCS device_context_manager.cc DEPS device_context)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc
deleted file mode 100644
index 7acd87c8c3db8f774de72251b028be10ef34770c..0000000000000000000000000000000000000000
--- a/paddle/platform/device.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/platform/device.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) {
-  static std::unique_ptr<CPUDeviceContext> g_cpu_device_context =
-      make_unique<CPUDeviceContext>(place);
-  return g_cpu_device_context.get();
-}
-
-#ifndef PADDLE_ONLY_CPU
-CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) {
-  static std::unique_ptr<CUDADeviceContext> g_cuda_device_context =
-      make_unique<CUDADeviceContext>(place);
-  return g_cuda_device_context.get();
-}
-#endif
-
-Device* GetDevice(const Place& place) {
-  CPUPlace cpu_place;
-#ifndef PADDLE_ONLY_CPU
-  if (is_gpu_place(place)) {
-    GPUPlace gpu_place = boost::get<GPUPlace>(place);
-    static std::unique_ptr<Device> g_device = make_unique<Device>(
-        GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place));
-    return g_device.get();
-  } else {
-    static std::unique_ptr<Device> g_device =
-        make_unique<Device>(GetCPUDeviceContext(cpu_place), nullptr);
-    return g_device.get();
-  }
-#else
-  static std::unique_ptr<Device> g_device =
-      make_unique<Device>(GetCPUDeviceContext(cpu_place));
-  return g_device.get();
-#endif
-}
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/platform/device_context_manager.cc b/paddle/platform/device_context_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..156d317c8a9e24ed7ace95429d0c7dc534210ece
--- /dev/null
+++ b/paddle/platform/device_context_manager.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context_manager.h"
+
+namespace paddle {
+namespace platform {
+
+// Starts with no contexts; GetDeviceContext() creates them on demand.
+DeviceContextManager::DeviceContextManager() : cpu_context_(nullptr) {
+#ifndef PADDLE_ONLY_CPU
+  device_count_ = GetDeviceCount();
+  // resize(), not reserve(): reserve() keeps size() at 0, so indexing
+  // cuda_contexts_[i] would be out of bounds. Every slot starts as nullptr.
+  cuda_contexts_.resize(device_count_, nullptr);
+#endif
+}
+
+// Returns the single CPU context, constructing it on the first request;
+// `place` is only used for that first construction.
+template <>
+CPUDeviceContext* DeviceContextManager::GetDeviceContext<
+    CPUPlace, CPUDeviceContext>(const CPUPlace& place) {
+  if (cpu_context_ == nullptr) cpu_context_ = new CPUDeviceContext(place);
+  return cpu_context_;
+}
+
+#ifndef PADDLE_ONLY_CPU
+// Lazily creates and caches one CUDA context per valid GPU id.
+template <>
+CUDADeviceContext* DeviceContextManager::GetDeviceContext<
+    GPUPlace, CUDADeviceContext>(const GPUPlace& place) {
+  const int gpu_id = place.device;  // negative ids would index out of range
+  PADDLE_ENFORCE(gpu_id >= 0 && gpu_id < device_count_,
+                 "GPU device id must be in range [0, device count)");
+  SetDeviceId(gpu_id);
+  auto& context = cuda_contexts_[gpu_id];
+  if (!context) context = new CUDADeviceContext(place);
+  return context;
+}
+#endif
+
+// Frees the CPU context and, in GPU builds, every CUDA context slot.
+// Deleting a null pointer is a no-op, so empty slots need no check.
+DeviceContextManager::~DeviceContextManager() {
+  // Contexts were allocated with new in GetDeviceContext().
+  delete cpu_context_;
+#ifndef PADDLE_ONLY_CPU
+  // One slot per device id below device_count_.
+  for (int gpu_id = 0; gpu_id < device_count_; ++gpu_id) {
+    delete cuda_contexts_[gpu_id];
+  }
+#endif
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device.h b/paddle/platform/device_context_manager.h
similarity index 52%
rename from paddle/platform/device.h
rename to paddle/platform/device_context_manager.h
index b1bb8073cf15dccbcb7ef20a9b412385ad666a4d..da15808a6079bbae30ae324277c5cd657ee82155 100644
--- a/paddle/platform/device.h
+++ b/paddle/platform/device_context_manager.h
@@ -13,33 +13,46 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include "paddle/platform/device_context.h"
-#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace platform {
 
-struct Device {
-  CPUDeviceContext* cpu_device_context;
-#ifndef PADDLE_ONLY_CPU
-  CUDADeviceContext* cuda_device_context;
-#endif
+template <typename T>
+struct Converter;  // Maps a Place type to its DeviceContext type.
+
+template <>
+struct Converter<CPUPlace> {
+  using DeviceContextType = CPUDeviceContext;  // context type for CPUPlace
+};
 
 #ifndef PADDLE_ONLY_CPU
-  Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu)
-      : cpu_device_context(cpu), cuda_device_context(gpu) {}
-#else
-  explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {}
-#endif
+template <>
+struct Converter<GPUPlace> {  // compiled only without PADDLE_ONLY_CPU
+  using DeviceContextType = CUDADeviceContext;  // context type for GPUPlace
 };
+#endif
+
+// Creates device contexts on first request and owns them until destroyed.
+class DeviceContextManager {
+ public:
+  DeviceContextManager();
+  ~DeviceContextManager();
+  template <typename PlaceType, typename DeviceType = typename Converter<
+                                    PlaceType>::DeviceContextType>
+  DeviceType* GetDeviceContext(const PlaceType& place);
 
-CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place);
+  static DeviceContextManager* Get() {  // process-wide instance
+    static DeviceContextManager inst;
+    return &inst;
+  }
 
+ private:
+  CPUDeviceContext* cpu_context_{nullptr};  // null until first CPU request
 #ifndef PADDLE_ONLY_CPU
-CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place);
+  int device_count_{0};
+  std::vector<CUDADeviceContext*> cuda_contexts_;  // indexed by GPU id
 #endif
-
-Device* GetDevice(const platform::Place& place);
+};
 }  // namespace platform
 }  // namespace paddle