Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
84b368d2
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
84b368d2
编写于
7月 29, 2017
作者:
Q
QI JUN
提交者:
GitHub
7月 29, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3101 from QiJune/fix_bug_in_CUDADeviceContext
Fix bug in cuda device context
上级
aee0d3ec
303fb789
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
38 addition
and
72 deletion
+38
-72
cmake/external/eigen.cmake
cmake/external/eigen.cmake
+1
-10
cmake/flags.cmake
cmake/flags.cmake
+1
-1
paddle/framework/detail/tensor-inl.h
paddle/framework/detail/tensor-inl.h
+12
-30
paddle/framework/tensor.h
paddle/framework/tensor.h
+1
-8
paddle/framework/tensor_test.cc
paddle/framework/tensor_test.cc
+9
-9
paddle/platform/device_context.cc
paddle/platform/device_context.cc
+14
-9
paddle/platform/device_context.h
paddle/platform/device_context.h
+0
-5
未找到文件。
cmake/external/eigen.cmake
浏览文件 @
84b368d2
...
@@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
...
@@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
ExternalProject_Add
(
ExternalProject_Add
(
extern_eigen3
extern_eigen3
${
EXTERNAL_PROJECT_LOG_ARGS
}
${
EXTERNAL_PROJECT_LOG_ARGS
}
# for latest version, please get from official website
# URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
# URL_MD5 "1a47e78efe365a97de0c022d127607c3"
# for no-ssl http support, please get from bazel's mirror
# URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
# URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7"
# get from github mirror
GIT_REPOSITORY
"https://github.com/RLovelett/eigen.git"
GIT_REPOSITORY
"https://github.com/RLovelett/eigen.git"
GIT_TAG
"
a46d2e7337c4656f00abe54a8115f6d76153a048
"
GIT_TAG
"
master
"
PREFIX
${
EIGEN_SOURCE_DIR
}
PREFIX
${
EIGEN_SOURCE_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
cmake/flags.cmake
浏览文件 @
84b368d2
...
@@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
...
@@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
# So, don't set these flags here.
LIST
(
APPEND CUDA_NVCC_FLAGS -std=c++11
)
LIST
(
APPEND CUDA_NVCC_FLAGS -std=c++11
--default-stream per-thread
)
LIST
(
APPEND CUDA_NVCC_FLAGS --use_fast_math
)
LIST
(
APPEND CUDA_NVCC_FLAGS --use_fast_math
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
...
...
paddle/framework/detail/tensor-inl.h
浏览文件 @
84b368d2
...
@@ -83,56 +83,38 @@ inline void Tensor::ShareDataWith(const Tensor& src) {
...
@@ -83,56 +83,38 @@ inline void Tensor::ShareDataWith(const Tensor& src) {
template
<
typename
T
>
template
<
typename
T
>
inline
void
Tensor
::
CopyFrom
(
const
Tensor
&
src
,
inline
void
Tensor
::
CopyFrom
(
const
Tensor
&
src
,
const
platform
::
CPUDeviceContext
&
ctx
)
{
const
platform
::
Place
&
dst_place
)
{
src
.
check_memory_size
<
T
>
();
src
.
check_memory_size
<
T
>
();
Resize
(
src
.
dims
());
Resize
(
src
.
dims
());
auto
src_place
=
src
.
holder_
->
place
();
auto
src_place
=
src
.
holder_
->
place
();
auto
src_ptr
=
static_cast
<
const
void
*>
(
src
.
data
<
T
>
());
auto
src_ptr
=
static_cast
<
const
void
*>
(
src
.
data
<
T
>
());
auto
dst_place
=
ctx
.
GetPlace
();
auto
dst_ptr
=
static_cast
<
void
*>
(
mutable_data
<
T
>
(
dst_place
));
auto
dst_ptr
=
static_cast
<
void
*>
(
mutable_data
<
T
>
(
dst_place
));
auto
size
=
product
(
src
.
dims_
)
*
sizeof
(
T
);
auto
size
=
product
(
src
.
dims_
)
*
sizeof
(
T
);
if
(
platform
::
is_cpu_place
(
src_place
))
{
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
)
)
{
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
}
#ifndef PADDLE_ONLY_CPU
#ifndef PADDLE_ONLY_CPU
else
if
(
platform
::
is_gpu_place
(
src_place
))
{
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
GPUPlace
>
(
src_place
),
src_ptr
,
size
,
0
);
boost
::
get
<
platform
::
GPUPlace
>
(
src_place
),
src_ptr
,
size
,
0
);
}
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
#endif
platform
::
is_gpu_place
(
dst_place
))
{
}
#ifndef PADDLE_ONLY_CPU
template
<
typename
T
>
inline
void
Tensor
::
CopyFrom
(
const
Tensor
&
src
,
const
platform
::
CUDADeviceContext
&
ctx
)
{
src
.
check_memory_size
<
T
>
();
Resize
(
src
.
dims
());
auto
src_place
=
src
.
holder_
->
place
();
auto
src_ptr
=
static_cast
<
const
void
*>
(
src
.
data
<
T
>
());
auto
dst_place
=
ctx
.
GetPlace
();
auto
dst_ptr
=
static_cast
<
void
*>
(
mutable_data
<
T
>
(
dst_place
));
auto
size
=
product
(
src
.
dims_
)
*
sizeof
(
T
);
if
(
platform
::
is_cpu_place
(
src_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
GPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
GPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
,
0
);
ctx
.
stream
());
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
}
else
if
(
platform
::
is_gpu_place
(
src
_place
))
{
platform
::
is_gpu_place
(
dst
_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
GPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
GPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
GPUPlace
>
(
src_place
),
src_ptr
,
size
,
boost
::
get
<
platform
::
GPUPlace
>
(
src_place
),
src_ptr
,
size
,
0
);
ctx
.
stream
());
}
}
}
#endif
#endif
}
template
<
typename
T
>
template
<
typename
T
>
inline
Tensor
Tensor
::
Slice
(
const
int
&
begin_idx
,
const
int
&
end_idx
)
const
{
inline
Tensor
Tensor
::
Slice
(
const
int
&
begin_idx
,
const
int
&
end_idx
)
const
{
...
...
paddle/framework/tensor.h
浏览文件 @
84b368d2
...
@@ -94,14 +94,7 @@ class Tensor {
...
@@ -94,14 +94,7 @@ class Tensor {
* @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
* @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
*/
*/
template
<
typename
T
>
template
<
typename
T
>
inline
void
CopyFrom
(
const
Tensor
&
src
,
inline
void
CopyFrom
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
);
const
platform
::
CPUDeviceContext
&
ctx
);
#ifndef PADDLE_ONLY_CPU
template
<
typename
T
>
inline
void
CopyFrom
(
const
Tensor
&
src
,
const
platform
::
CUDADeviceContext
&
ctx
);
#endif
/**
/**
* @brief Return the slice of the tensor.
* @brief Return the slice of the tensor.
...
...
paddle/framework/tensor_test.cc
浏览文件 @
84b368d2
...
@@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) {
...
@@ -198,8 +198,8 @@ TEST(Tensor, CopyFrom) {
int
arr
[
9
]
=
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
arr
[
9
]
=
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
memcpy
(
src_ptr
,
arr
,
9
*
sizeof
(
int
));
memcpy
(
src_ptr
,
arr
,
9
*
sizeof
(
int
));
auto
*
cpu_ctx
=
new
paddle
::
platform
::
CPUDeviceContext
();
auto
cpu_place
=
new
paddle
::
platform
::
CPUPlace
();
dst_tensor
.
CopyFrom
<
int
>
(
src_tensor
,
*
cpu_
ctx
);
dst_tensor
.
CopyFrom
<
int
>
(
src_tensor
,
*
cpu_
place
);
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSERT_NE
(
src_ptr
,
dst_ptr
);
ASSERT_NE
(
src_ptr
,
dst_ptr
);
...
@@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) {
...
@@ -208,7 +208,7 @@ TEST(Tensor, CopyFrom) {
}
}
Tensor
slice_tensor
=
src_tensor
.
Slice
<
int
>
(
1
,
2
);
Tensor
slice_tensor
=
src_tensor
.
Slice
<
int
>
(
1
,
2
);
dst_tensor
.
CopyFrom
<
int
>
(
slice_tensor
,
*
cpu_
ctx
);
dst_tensor
.
CopyFrom
<
int
>
(
slice_tensor
,
*
cpu_
place
);
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
dst_ptr
=
dst_tensor
.
data
<
int
>
();
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSERT_NE
(
dst_ptr
,
slice_ptr
);
ASSERT_NE
(
dst_ptr
,
slice_ptr
);
...
@@ -228,12 +228,12 @@ TEST(Tensor, CopyFrom) {
...
@@ -228,12 +228,12 @@ TEST(Tensor, CopyFrom) {
memcpy
(
src_ptr
,
arr
,
9
*
sizeof
(
int
));
memcpy
(
src_ptr
,
arr
,
9
*
sizeof
(
int
));
// CPU Tensor to GPU Tensor
// CPU Tensor to GPU Tensor
auto
gpu_
ctx
=
new
paddle
::
platform
::
CUDADeviceContext
(
0
);
auto
gpu_
place
=
new
paddle
::
platform
::
GPUPlace
(
0
);
gpu_tensor
.
CopyFrom
<
int
>
(
src_tensor
,
*
gpu_
ctx
);
gpu_tensor
.
CopyFrom
<
int
>
(
src_tensor
,
*
gpu_
place
);
// GPU Tensor to CPU Tensor
// GPU Tensor to CPU Tensor
auto
cpu_
ctx
=
new
paddle
::
platform
::
CPUDeviceContext
();
auto
cpu_
place
=
new
paddle
::
platform
::
CPUPlace
();
dst_tensor
.
CopyFrom
<
int
>
(
gpu_tensor
,
*
cpu_
ctx
);
dst_tensor
.
CopyFrom
<
int
>
(
gpu_tensor
,
*
cpu_
place
);
// Compare Tensors
// Compare Tensors
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
...
@@ -245,10 +245,10 @@ TEST(Tensor, CopyFrom) {
...
@@ -245,10 +245,10 @@ TEST(Tensor, CopyFrom) {
Tensor
slice_tensor
=
src_tensor
.
Slice
<
int
>
(
1
,
2
);
Tensor
slice_tensor
=
src_tensor
.
Slice
<
int
>
(
1
,
2
);
// CPU Slice Tensor to GPU Tensor
// CPU Slice Tensor to GPU Tensor
gpu_tensor
.
CopyFrom
<
int
>
(
slice_tensor
,
*
gpu_
ctx
);
gpu_tensor
.
CopyFrom
<
int
>
(
slice_tensor
,
*
gpu_
place
);
// GPU Tensor to CPU Tensor
// GPU Tensor to CPU Tensor
dst_tensor
.
CopyFrom
<
int
>
(
gpu_tensor
,
*
cpu_
ctx
);
dst_tensor
.
CopyFrom
<
int
>
(
gpu_tensor
,
*
cpu_
place
);
// Compare Slice Tensors
// Compare Slice Tensors
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
...
...
paddle/platform/device_context.cc
浏览文件 @
84b368d2
...
@@ -43,8 +43,19 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
...
@@ -43,8 +43,19 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
CUDADeviceContext
::
CUDADeviceContext
(
GPUPlace
place
)
:
place_
(
place
)
{
CUDADeviceContext
::
CUDADeviceContext
(
GPUPlace
place
)
:
place_
(
place
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
// TODO (qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
eigen_stream_
.
reset
(
new
Eigen
::
CudaStreamDevice
(
&
stream_
));
// here will cause a segmentation fault. We must implement a class derived from
// Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
// later. Please refer to the implementation of class EigenCudaStreamDevice
// in TensorFlow.
//
// We find that CUDA 7 introduces a new option, the per-thread default stream,
// that has two effects. Please refer to https://devblogs.nvidia.com/
// parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
//
// So, we decide to use the default stream and add the --default-stream per-thread
// nvcc flag. Then, two threads with two CUDADeviceContexts will run in parallel.
eigen_stream_
.
reset
(
new
Eigen
::
CudaStreamDevice
());
eigen_device_
.
reset
(
new
Eigen
::
GpuDevice
(
eigen_stream_
.
get
()));
eigen_device_
.
reset
(
new
Eigen
::
GpuDevice
(
eigen_stream_
.
get
()));
}
}
...
@@ -64,15 +75,12 @@ CUDADeviceContext::~CUDADeviceContext() {
...
@@ -64,15 +75,12 @@ CUDADeviceContext::~CUDADeviceContext() {
}
}
eigen_stream_
.
reset
();
eigen_stream_
.
reset
();
eigen_device_
.
reset
();
eigen_device_
.
reset
();
PADDLE_ENFORCE
(
cudaStreamDestroy
(
stream_
));
}
}
Place
CUDADeviceContext
::
GetPlace
()
const
{
return
place_
;
}
Place
CUDADeviceContext
::
GetPlace
()
const
{
return
place_
;
}
cudaStream_t
CUDADeviceContext
::
stream
()
const
{
return
stream_
;
}
void
CUDADeviceContext
::
Wait
()
const
{
void
CUDADeviceContext
::
Wait
()
const
{
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream_
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
0
));
}
}
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
...
@@ -83,7 +91,6 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
...
@@ -83,7 +91,6 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
if
(
!
cublas_handle_
)
{
if
(
!
cublas_handle_
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
PADDLE_ENFORCE
(
dynload
::
cublasCreate
(
&
cublas_handle_
));
PADDLE_ENFORCE
(
dynload
::
cublasCreate
(
&
cublas_handle_
));
PADDLE_ENFORCE
(
dynload
::
cublasSetStream
(
cublas_handle_
,
stream_
));
}
}
return
cublas_handle_
;
return
cublas_handle_
;
}
}
...
@@ -92,7 +99,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
...
@@ -92,7 +99,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
if
(
!
cudnn_handle_
)
{
if
(
!
cudnn_handle_
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
PADDLE_ENFORCE
(
dynload
::
cudnnCreate
(
&
cudnn_handle_
));
PADDLE_ENFORCE
(
dynload
::
cudnnCreate
(
&
cudnn_handle_
));
PADDLE_ENFORCE
(
dynload
::
cudnnSetStream
(
cudnn_handle_
,
stream_
));
}
}
return
cudnn_handle_
;
return
cudnn_handle_
;
}
}
...
@@ -104,7 +110,6 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
...
@@ -104,7 +110,6 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
CURAND_RNG_PSEUDO_DEFAULT
));
CURAND_RNG_PSEUDO_DEFAULT
));
PADDLE_ENFORCE
(
PADDLE_ENFORCE
(
dynload
::
curandSetPseudoRandomGeneratorSeed
(
curand_generator_
,
seed_
));
dynload
::
curandSetPseudoRandomGeneratorSeed
(
curand_generator_
,
seed_
));
PADDLE_ENFORCE
(
dynload
::
curandSetStream
(
curand_generator_
,
stream_
));
}
}
return
curand_generator_
;
return
curand_generator_
;
}
}
...
...
paddle/platform/device_context.h
浏览文件 @
84b368d2
...
@@ -61,9 +61,6 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -61,9 +61,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
/*! \brief Wait for all operations completion in the stream. */
void
Wait
()
const
;
void
Wait
()
const
;
/*! \brief Return CUDA stream in the device context. */
cudaStream_t
stream
()
const
;
/*! \brief Return place in the device context. */
/*! \brief Return place in the device context. */
Place
GetPlace
()
const
override
;
Place
GetPlace
()
const
override
;
...
@@ -91,8 +88,6 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -91,8 +88,6 @@ class CUDADeviceContext : public DeviceContext {
private:
private:
uint64_t
seed_
;
uint64_t
seed_
;
cudaStream_t
stream_
;
// clang-format off
// clang-format off
cudnnHandle_t
cudnn_handle_
=
nullptr
;
cudnnHandle_t
cudnn_handle_
=
nullptr
;
cublasHandle_t
cublas_handle_
=
nullptr
;
cublasHandle_t
cublas_handle_
=
nullptr
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录