diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8035d93bfec75b20a54c5af0521ab724cafba8ca..9cc4233e43267472d405c3e4e617f0782e1430ea 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc)
-cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
     DEPS
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index 098931c887479ce6f1afc8b90e4003758d88c018..aaab1142ca18d3319469a4d685fde9d30929113f 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -35,7 +35,7 @@ void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                   platform::GPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GPUPlaceGuard g(src_place.device);
+  platform::SetDeviceId(src_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
@@ -45,7 +45,7 @@ void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
                                                   platform::CPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GPUPlaceGuard g(dst_place.device);
+  platform::SetDeviceId(dst_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
 
@@ -56,7 +56,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
   if (dst_place == src_place) {
-    platform::GPUPlaceGuard g(src_place.device);
+    platform::SetDeviceId(src_place.device);
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
   } else {
     platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 9c1d94e9e703caf2db92ca4a8eac975317e6b945..44afb5d4ee6f961b54b6ee0d6bd0bc42a6d422e4 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -20,12 +20,96 @@ Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
   return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }
 
+CPUDeviceContext::CPUDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+
 #ifndef PADDLE_ONLY_CPU
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
 }
-#endif
+
+CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  SetDeviceId(place_.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDADeviceContext::~CUDADeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  if (cublas_handle_) {
+    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  }
+
+  if (cudnn_handle_) {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+  }
+
+  if (curand_generator_) {
+    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
+  }
+  eigen_stream_.reset();
+  eigen_device_.reset();
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+Place CUDADeviceContext::GetPlace() const { return place_; }
+
+cudaStream_t CUDADeviceContext::stream() const { return stream_; }
+
+void CUDADeviceContext::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+}
+
+Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+cublasHandle_t CUDADeviceContext::cublas_handle() {
+  if (!cublas_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
+  }
+  return cublas_handle_;
+}
+
+cudnnHandle_t CUDADeviceContext::cudnn_handle() {
+  if (!cudnn_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
+  }
+  return cudnn_handle_;
+}
+
+curandGenerator_t CUDADeviceContext::curand_generator() {
+  if (!curand_generator_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
+                                                  CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(
+        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
+    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
+  }
+  return curand_generator_;
+}
+
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 5a366dccdc080ce61cf45ea2e22e23b703313682..57035b335f2b5c55578b9da7bdcdc939b1a83efa 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -39,14 +39,13 @@ class DeviceContext {
 
 class CPUDeviceContext : public DeviceContext {
  public:
-  CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); }
+  CPUDeviceContext();
+  CPUDeviceContext(CPUPlace);
+  virtual ~CPUDeviceContext() {}
 
-  Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); }
+  Eigen::DefaultDevice* eigen_device() const;
 
-  Place GetPlace() const override {
-    Place retv = CPUPlace();
-    return retv;
-  }
+  Place GetPlace() const override;
 
  private:
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
@@ -54,119 +53,51 @@ class CPUDeviceContext : public DeviceContext {
 
 #ifndef PADDLE_ONLY_CPU
 
-class GPUPlaceGuard {
+class CUDADeviceContext : public DeviceContext {
  public:
-  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
-    if (previous_ != new_place) {
-      paddle::platform::SetDeviceId(new_place.device);
-    }
-  }
+  explicit CUDADeviceContext(GPUPlace);
+  virtual ~CUDADeviceContext();
 
-  ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); }
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const;
 
- private:
-  GPUPlace previous_;
-};
+  /*! \brief  Return CUDA stream in the device context. */
+  cudaStream_t stream() const;
 
-class CUDADeviceContext : public DeviceContext {
- public:
-  explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) {
-    GPUPlaceGuard guard(gpu_place_);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream_), "cudaStreamCreate failed");
-    eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
-    eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
-  }
-
-  Place GetPlace() const override {
-    Place retv = GPUPlace();
-    return retv;
-  }
-
-  void Wait() {
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_),
-                   "cudaStreamSynchronize failed");
-  }
-
-  cudaStream_t stream() const { return stream_; }
-
-  Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); }
-
-  cublasHandle_t cublas_handle() {
-    if (!blas_handle_) {
-      GPUPlaceGuard guard(gpu_place_);
-      PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_),
-                     "cublasCreate failed");
-      PADDLE_ENFORCE(
-          paddle::platform::dynload::cublasSetStream(blas_handle_, stream_),
-          "cublasSetStream failed");
-    }
-    return blas_handle_;
-  }
-
-  cudnnHandle_t cudnn_handle() {
-    if (!dnn_handle_) {
-      GPUPlaceGuard guard(gpu_place_);
-      PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_),
-                     "cudnnCreate failed");
-      PADDLE_ENFORCE(
-          paddle::platform::dynload::cudnnSetStream(dnn_handle_, stream_),
-          "cudnnSetStream failed");
-    }
-    return dnn_handle_;
-  }
-
-  curandGenerator_t curand_generator() {
-    if (!rand_generator_) {
-      GPUPlaceGuard guard(gpu_place_);
-      PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator(
-                         &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT),
-                     "curandCreateGenerator failed");
-      PADDLE_ENFORCE(
-          paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed(
-              rand_generator_, random_seed_),
-          "curandSetPseudoRandomGeneratorSeed failed");
-      PADDLE_ENFORCE(
-          paddle::platform::dynload::curandSetStream(rand_generator_, stream_),
-          "curandSetStream failed");
-    }
-    return rand_generator_;
-  }
-
-  ~CUDADeviceContext() {
-    Wait();
-    if (blas_handle_) {
-      PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_),
-                     "cublasDestroy failed");
-    }
-
-    if (dnn_handle_) {
-      PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_),
-                     "cudnnDestroy failed");
-    }
-
-    if (rand_generator_) {
-      PADDLE_ENFORCE(
-          paddle::platform::dynload::curandDestroyGenerator(rand_generator_),
-          "curandDestroyGenerator failed");
-    }
-    eigen_stream_.reset();
-    eigen_device_.reset();
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_), "cudaStreamDestroy failed");
-  }
+  /*! \brief  Return place in the device context. */
+  Place GetPlace() const override;
+
+  /*! \brief  Return eigen device in the device context. */
+  Eigen::GpuDevice* eigen_device() const;
+
+  // clang-format off
+  /*! \brief  Return cublas handle in the device context. */
+  cublasHandle_t    cublas_handle   ();
+
+  /*! \brief  Return cudnn  handle in the device context. */
+  cudnnHandle_t     cudnn_handle    ();
+
+  /*! \brief  Return curand handle in the device context. */
+  curandGenerator_t curand_generator();
+  // clang-format on
 
  private:
-  GPUPlace gpu_place_;
-  cudaStream_t stream_;
+  GPUPlace place_;
 
-  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
+ private:
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
+  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
 
-  cublasHandle_t blas_handle_{nullptr};
+ private:
+  uint64_t seed_;
 
-  cudnnHandle_t dnn_handle_{nullptr};
+  cudaStream_t stream_;
 
-  int random_seed_;
-  curandGenerator_t rand_generator_{nullptr};
+  // clang-format off
+  cudnnHandle_t     cudnn_handle_     = nullptr;
+  cublasHandle_t    cublas_handle_    = nullptr;
+  curandGenerator_t curand_generator_ = nullptr;
+  // clang-format on
 };
 
 #endif
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index a3a10fc07fed1cb245dc721d18ea927ea25393a1..fd4adbd9deca12ad6c3a59cfd5d30fb0cb6fcf98 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -58,11 +58,6 @@ struct EnforceNotMet : public std::exception {
 // For more details, please check https://stackoverflow.com/a/43870188/724872.
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 
-template <typename T>
-inline void throw_on_error(T e) {
-  throw_on_error(e, "");
-}
-
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     int stat, const Args&... args) {
@@ -132,6 +127,11 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 
 #endif  // PADDLE_ONLY_CPU
 
+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
 #define PADDLE_THROW(...)                                      \
   do {                                                         \
     throw ::paddle::platform::EnforceNotMet(                   \