diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 3c972368b615a931e5629df48a3f5fff650cd670..937b26f807545cc8cf844787fc9e94c18499ba77 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -42,3 +42,5 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
         naive_managed_allocator
         aligned_allocator
         cuda_device_guard)
+
+nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)  # CUDA test built from allocation_and_eigen_test.cu; links allocator_facade
diff --git a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e4d690c296cfe9aa273c9b94688b44ef62bf5e97
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+struct FillZero {  // Device-side functor: stores 0.0f at ptr_[i].
+ public:
+  float* ptr_;  // Destination buffer; must be dereferenceable in device code.
+
+  __device__ void operator()(size_t i) { ptr_[i] = 0.0f; }  // i: element index
+};
+
+namespace paddle {
+TEST(Eigen, main) {  // Zero one GPU tensor via memset, ForRange, then Eigen.
+  framework::Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  float* ptr = tensor.mutable_data<float>({10, 10}, gpu);  // 10 * 10 floats
+  auto& dev_ctx = *static_cast<platform::CUDADeviceContext*>(  // downcast
+      platform::DeviceContextPool::Instance().Get(gpu));
+  PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * 100));
+  platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, 100);
+  for_range(FillZero{ptr});  // second pass: zero through the FillZero functor
+  dev_ctx.Wait();
+  // Third pass: write the same buffer through Eigen on dev_ctx.
+  auto eigen_vec = framework::EigenVector<float>::Flatten(tensor);
+  auto& eigen_dev = *dev_ctx.eigen_device();
+  eigen_vec.device(eigen_dev) = eigen_vec.constant(0.0f);
+  dev_ctx.Wait();  // do not leave the Eigen write pending when tensor dies
+}
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 48b5f45d7761b452c6010e29fc30c2657d0296a7..bfd5f959faca56168cd0acfd31d23cdf7cbbb965 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -46,7 +46,6 @@ class AllocatorFacadePrivate {
   }
 
   AllocatorFacadePrivate() {
-    std::cout << "Init Allocator Facade" << std::endl;
     InitCPUAllocator();
     InitCUDAAllocator();
   }
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index bf9aced57fe88267eb00168808b1f930eb456080..7b477c53ea2ead54073658a015ca7dc5a41309eb 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -31,7 +31,6 @@ std::unique_ptr<Allocation> CUDAAllocator::Allocate(size_t size, Attr attr) {
         "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
         status, cudaGetErrorString(status)));
   }
-
   return std::unique_ptr<Allocation>(
       new CUDAAllocation(ptr, size, platform::Place(place_)));
 }
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 91101356436c26171eaca2fe01dfd4d937e71717..0f7ce471f0f40edb3b34c4fdad9c2dbdc278505e 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -72,7 +72,7 @@ cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
 if(WITH_GPU)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 0b97f5123a8dcf01da9a50acb1a1fad399a42c84..7d6c3412ce31f798354159ab41569f7b1bfcbfc6 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
-
 #include <set>
 #include <string>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/platform/cuda_device_guard.h"
 
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
@@ -205,7 +205,7 @@ class CudnnHolder {
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     : place_(place), cudnn_holder_(nullptr) {
-  SetDeviceId(place_.device);
+  CUDADeviceGuard guard(place_.device);
   compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 4c99f4be321160caf0ee2f89a655bdfb933408e3..25a693ab95f1b360a532527e7838dca0b952f294 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -64,7 +65,7 @@ void InitP2P(std::vector<int> devices) {
           LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                        << " to " << devices[j];
         } else {
-          cudaSetDevice(devices[i]);
+          platform::CUDADeviceGuard guard(devices[i]);
           cudaDeviceEnablePeerAccess(devices[j], 0);
         }
       }