Fix SetDevice on init

8e3fdc6e · Yu Yang · 524f6e9b · 8e3fdc6e · 8e3fdc6e · 8e3fdc6e
7 changed files
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -42,3 +42,5 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
        naive_managed_allocator
        aligned_allocator
        cuda_device_guard)
+nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+// Functor invoked from device code: stores 0.0f into element i of the float
+// buffer that ptr_ points to. Kept as a plain aggregate so it can be
+// brace-initialised directly from a raw pointer (FillZero{ptr}).
+struct FillZero {
+  float* ptr_;  // destination buffer; only dereferenced inside operator()
+
+  __device__ void operator()(size_t i) {
+    ptr_[i] = 0.0f;
+  }
+};
+namespace paddle {
+// Smoke test for the allocator: a GPU tensor buffer obtained through
+// mutable_data() is written three ways -- a raw cudaMemset, a ForRange launch
+// of the FillZero functor, and an Eigen constant assignment evaluated on the
+// device context's Eigen device. The test makes no value assertions; it
+// passes when none of those steps reports an error.
+TEST(Eigen, main) {
+  framework::Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  float* ptr = tensor.mutable_data<float>({10, 10}, gpu);
+  // Take the element count from the tensor so it cannot drift from the shape
+  // passed to mutable_data() above.
+  const size_t numel = static_cast<size_t>(tensor.numel());
+
+  // The pool hands back its generic context pointer (presumably
+  // platform::DeviceContext* -- confirm against device_context.h). A
+  // class-hierarchy downcast should be a static_cast: it is checked by the
+  // compiler against the inheritance relation, whereas reinterpret_cast
+  // would silently accept any unrelated pointer type.
+  auto& dev_ctx = *static_cast<platform::CUDADeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(gpu));
+
+  PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * numel));
+
+  platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, numel);
+  for_range(FillZero{ptr});
+  dev_ctx.Wait();
+
+  auto eigen_vec = framework::EigenVector<float>::Flatten(tensor);
+  auto& eigen_dev = *dev_ctx.eigen_device();
+  eigen_vec.device(eigen_dev) = eigen_vec.constant(0.0f);
+  // Treat the Eigen launch like the ForRange launch above: wait on the
+  // context before the test body ends, so that a failure of the queued work
+  // is attributed to this test instead of going unnoticed.
+  dev_ctx.Wait();
+}
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -46,7 +46,6 @@ class AllocatorFacadePrivate {
  }
  AllocatorFacadePrivate() {
-    std::cout << "Init Allocator Facade" << std::endl;
    InitCPUAllocator();
    InitCUDAAllocator();
  }

--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -31,7 +31,6 @@ std::unique_ptr<Allocation> CUDAAllocator::Allocate(size_t size, Attr attr) {
        "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
        status, cudaGetErrorString(status)));
  }
  return std::unique_ptr<Allocation>(
      new CUDAAllocation(ptr, size, platform::Place(place_)));
 }

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -72,7 +72,7 @@ cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
 if(WITH_GPU)
    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 #include <string>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
@@ -205,7 +205,7 @@ class CudnnHolder {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
    : place_(place), cudnn_holder_(nullptr) {
-  SetDeviceId(place_.device);
+  CUDADeviceGuard guard(place_.device);
  compute_capability = GetCUDAComputeCapability(place_.device);
  multi_process = GetCUDAMultiProcessors(place_.device);
  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);

--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -64,7 +65,7 @@ void InitP2P(std::vector<int> devices) {
          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                       << " to " << devices[j];
        } else {
-          cudaSetDevice(devices[i]);
+          platform::CUDADeviceGuard guard(devices[i]);
          cudaDeviceEnablePeerAccess(devices[j], 0);
        }
      }