fix bug in CUDADeviceContext

d962c2a9 · qijun · 44486b6f · d962c2a9 · d962c2a9
隐藏空白更改
内联并排

Showing 2 changed files with 14 additions and 2 deletions

cmake/flags.cmake cmake/flags.cmake +1 -1

paddle/platform/device_context.cc paddle/platform/device_context.cc +13 -1

未找到文件。
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)

 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)

 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -44,7 +44,19 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
  SetDeviceId(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-  eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
+  // TODO(qijun): Passing the created CUDA stream directly to
+  // Eigen::CudaStreamDevice here causes a segmentation fault. We must implement
+  // a class derived from Eigen::StreamInterface and reinitialize it later with a
+  // CUDA stream and a GPU id. Please refer to the implementation of class
+  // EigenCudaStreamDevice in TensorFlow.
+  //
+  // CUDA 7 introduces a new option, the per-thread default stream, that has two
+  // effects. Please refer to:
+  // https://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+  //
+  // So we use the default stream and add the `--default-stream per-thread` nvcc
+  // flag. Then two threads with two CUDADeviceContexts will run in parallel.
+  eigen_stream_.reset(new Eigen::CudaStreamDevice());
  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }