提交 5364b394 编写于 作者: Q qijun

use cuda default stream

上级 d962c2a9
...@@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ...@@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
ExternalProject_Add( ExternalProject_Add(
extern_eigen3 extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
# for latest version, please get from official website
# URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
# URL_MD5 "1a47e78efe365a97de0c022d127607c3"
# for no-ssl http support, please get from bazel's mirror
# URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
# URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7"
# get from github mirror
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" GIT_TAG "master"
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
......
...@@ -83,14 +83,13 @@ inline void Tensor::ShareDataWith(const Tensor& src) { ...@@ -83,14 +83,13 @@ inline void Tensor::ShareDataWith(const Tensor& src) {
template <typename T> template <typename T>
inline void Tensor::CopyFrom(const Tensor& src, inline void Tensor::CopyFrom(const Tensor& src,
const platform::CPUDeviceContext& ctx) { const platform::CPUPlace& dst_place) {
src.check_memory_size<T>(); src.check_memory_size<T>();
Resize(src.dims()); Resize(src.dims());
auto src_place = src.holder_->place(); auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>()); auto src_ptr = static_cast<const void*>(src.data<T>());
auto dst_place = ctx.GetPlace();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place)); auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto size = product(src.dims_) * sizeof(T); auto size = product(src.dims_) * sizeof(T);
...@@ -110,26 +109,23 @@ inline void Tensor::CopyFrom(const Tensor& src, ...@@ -110,26 +109,23 @@ inline void Tensor::CopyFrom(const Tensor& src,
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
template <typename T> template <typename T>
inline void Tensor::CopyFrom(const Tensor& src, inline void Tensor::CopyFrom(const Tensor& src,
const platform::CUDADeviceContext& ctx) { const platform::GPUPlace& dst_place) {
src.check_memory_size<T>(); src.check_memory_size<T>();
Resize(src.dims()); Resize(src.dims());
auto src_place = src.holder_->place(); auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>()); auto src_ptr = static_cast<const void*>(src.data<T>());
auto dst_place = ctx.GetPlace();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place)); auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto size = product(src.dims_) * sizeof(T); auto size = product(src.dims_) * sizeof(T);
if (platform::is_cpu_place(src_place)) { if (platform::is_cpu_place(src_place)) {
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size, boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
ctx.stream());
} else if (platform::is_gpu_place(src_place)) { } else if (platform::is_gpu_place(src_place)) {
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
boost::get<platform::GPUPlace>(src_place), src_ptr, size, boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
ctx.stream());
} }
} }
#endif #endif
......
...@@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) { ...@@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) {
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
memcpy(src_ptr, arr, 9 * sizeof(int)); memcpy(src_ptr, arr, 9 * sizeof(int));
auto* cpu_ctx = new paddle::platform::CPUDeviceContext(); auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_ctx); dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr); ASSERT_NE(src_ptr, dst_ptr);
...@@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) { ...@@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) {
} }
Tensor slice_tensor = src_tensor.Slice<int>(1, 2); Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_ctx); dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
const int* slice_ptr = slice_tensor.data<int>(); const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>(); dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr); ASSERT_NE(dst_ptr, slice_ptr);
...@@ -228,12 +228,12 @@ TEST(Tensor, CopyFrom) { ...@@ -228,12 +228,12 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int)); memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to GPU Tensor // CPU Tensor to GPU Tensor
auto gpu_ctx = new paddle::platform::CUDADeviceContext(0); auto gpu_place = new paddle::platform::GPUPlace(0);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_ctx); gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
// GPU Tensor to CPU Tensor // GPU Tensor to CPU Tensor
auto cpu_ctx = new paddle::platform::CPUDeviceContext(); auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx); dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
// Compare Tensors // Compare Tensors
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
...@@ -245,10 +245,10 @@ TEST(Tensor, CopyFrom) { ...@@ -245,10 +245,10 @@ TEST(Tensor, CopyFrom) {
Tensor slice_tensor = src_tensor.Slice<int>(1, 2); Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
// CPU Slice Tensor to GPU Tensor // CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_ctx); gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
// GPU Tensor to CPU Tensor // GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx); dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
// Compare Slice Tensors // Compare Slice Tensors
const int* slice_ptr = slice_tensor.data<int>(); const int* slice_ptr = slice_tensor.data<int>();
......
...@@ -34,7 +34,7 @@ void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place, ...@@ -34,7 +34,7 @@ void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst, void* dst,
platform::GPUPlace src_place, platform::GPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
cudaStream_t stream) { cudaStream_t stream = 0) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} }
...@@ -44,7 +44,7 @@ void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place, ...@@ -44,7 +44,7 @@ void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst, void* dst,
platform::CPUPlace src_place, platform::CPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
cudaStream_t stream) { cudaStream_t stream = 0) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} }
...@@ -54,7 +54,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place, ...@@ -54,7 +54,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst, void* dst,
platform::GPUPlace src_place, platform::GPUPlace src_place,
const void* src, size_t num, const void* src, size_t num,
cudaStream_t stream) { cudaStream_t stream = 0) {
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
......
...@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); ...@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
*/ */
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream); cudaStream_t stream = 0);
#endif // PADDLE_ONLY_CPU #endif // PADDLE_ONLY_CPU
......
...@@ -43,7 +43,6 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const { ...@@ -43,7 +43,6 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
// TODO (qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly // TODO (qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
// here will cause segment fault. We must implement a class derived from // here will cause segment fault. We must implement a class derived from
// Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
...@@ -76,15 +75,12 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -76,15 +75,12 @@ CUDADeviceContext::~CUDADeviceContext() {
} }
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
} }
Place CUDADeviceContext::GetPlace() const { return place_; } Place CUDADeviceContext::GetPlace() const { return place_; }
cudaStream_t CUDADeviceContext::stream() const { return stream_; }
void CUDADeviceContext::Wait() const { void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamSynchronize(0));
} }
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
...@@ -95,7 +91,6 @@ cublasHandle_t CUDADeviceContext::cublas_handle() { ...@@ -95,7 +91,6 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
if (!cublas_handle_) { if (!cublas_handle_) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
} }
return cublas_handle_; return cublas_handle_;
} }
...@@ -104,7 +99,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() { ...@@ -104,7 +99,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
if (!cudnn_handle_) { if (!cudnn_handle_) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
} }
return cudnn_handle_; return cudnn_handle_;
} }
...@@ -116,7 +110,6 @@ curandGenerator_t CUDADeviceContext::curand_generator() { ...@@ -116,7 +110,6 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
CURAND_RNG_PSEUDO_DEFAULT)); CURAND_RNG_PSEUDO_DEFAULT));
PADDLE_ENFORCE( PADDLE_ENFORCE(
dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_)); dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
} }
return curand_generator_; return curand_generator_;
} }
......
...@@ -61,9 +61,6 @@ class CUDADeviceContext : public DeviceContext { ...@@ -61,9 +61,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
void Wait() const; void Wait() const;
/*! \brief Return CUDA stream in the device context. */
cudaStream_t stream() const;
/*! \brief Return place in the device context. */ /*! \brief Return place in the device context. */
Place GetPlace() const override; Place GetPlace() const override;
...@@ -91,8 +88,6 @@ class CUDADeviceContext : public DeviceContext { ...@@ -91,8 +88,6 @@ class CUDADeviceContext : public DeviceContext {
private: private:
uint64_t seed_; uint64_t seed_;
cudaStream_t stream_;
// clang-format off // clang-format off
cudnnHandle_t cudnn_handle_ = nullptr; cudnnHandle_t cudnn_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册