diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 34fd348893058980964d723490d9cc220a157b5a..ef31c252038ce18655913c0f41343fe6dc7dbb86 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 44afb5d4ee6f961b54b6ee0d6bd0bc42a6d422e4..5218d89d543e17568236f6f72b47475048484d74 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -44,7 +44,19 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-  eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
+  // TODO (qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
+  // here will cause segment fault. We must implement a class derived from
+  // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
+  // later. Please refer to the implementation of class EigenCudaStreamDevice
+  // in TensorFlow.
+  //
+  // We find that CUDA 7 introduces a new option, the per-thread default stream,
+  // that has two effects. Please refer to https://devblogs.nvidia.com/
+  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+  //
+  // So, we decide to use default stream and add –default-stream per-thread nvcc
+  // flag. Than, two threads with two CUDADeviceContexts will run parallelly.
+  eigen_stream_.reset(new Eigen::CudaStreamDevice());
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }