Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
131ec276
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
131ec276
编写于
3月 05, 2018
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug for big number; float->double and code refine
上级
82bd82c1
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
36 addition
and
17 deletion
+36
-17
paddle/fluid/operators/math/concat.cu
paddle/fluid/operators/math/concat.cu
+24
-17
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+6
-0
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+6
-0
未找到文件。
paddle/fluid/operators/math/concat.cu
浏览文件 @
131ec276
...
@@ -70,7 +70,7 @@ __global__ void KernelConcat(T** inputs, const int input_col,
...
@@ -70,7 +70,7 @@ __global__ void KernelConcat(T** inputs, const int input_col,
const
int
output_rows
,
const
int
output_cols
,
const
int
output_rows
,
const
int
output_cols
,
T
*
output
)
{
T
*
output
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
inv_input_col
=
1.0
/
input_col
;
double
inv_input_col
=
1.0
/
input_col
;
for
(;
tid_x
<
output_cols
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(;
tid_x
<
output_cols
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
split
=
tid_x
*
inv_input_col
;
int
split
=
tid_x
*
inv_input_col
;
int
in_offset
=
tid_x
-
split
*
input_col
;
int
in_offset
=
tid_x
-
split
*
input_col
;
...
@@ -113,7 +113,7 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
...
@@ -113,7 +113,7 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
const
int
input_col
,
const
int
output_cols
,
const
int
input_col
,
const
int
output_cols
,
T
**
outputs
)
{
T
**
outputs
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
float
inv_input_col
=
1.0
/
input_col
;
double
inv_input_col
=
1.0
/
input_col
;
for
(;
tid_x
<
input_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(;
tid_x
<
input_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
split
=
tid_x
*
inv_input_col
;
int
split
=
tid_x
*
inv_input_col
;
int
in_offset
=
tid_x
-
split
*
input_col
;
int
in_offset
=
tid_x
-
split
*
input_col
;
...
@@ -145,8 +145,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
...
@@ -145,8 +145,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int
cols
=
input
[
0
].
numel
()
/
rows
;
int
cols
=
input
[
0
].
numel
()
/
rows
;
int
out_rows
=
rows
,
out_cols
=
0
;
int
out_rows
=
rows
,
out_cols
=
0
;
paddle
::
framework
::
Vector
<
int16_t
>
inputs_data
(
num
*
sizeof
(
T
*
)
/
2
);
framework
::
Vector
<
int16_t
>
inputs_data
(
num
*
sizeof
(
T
*
)
/
2
);
paddle
::
framework
::
Vector
<
int
>
inputs_cols
(
num
+
1
);
framework
::
Vector
<
int
>
inputs_cols
(
num
+
1
);
inputs_cols
[
0
]
=
0
;
inputs_cols
[
0
]
=
0
;
T
**
inputs_ptr
=
reinterpret_cast
<
T
**>
(
inputs_data
.
data
());
T
**
inputs_ptr
=
reinterpret_cast
<
T
**>
(
inputs_data
.
data
());
...
@@ -168,15 +168,14 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
...
@@ -168,15 +168,14 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
// computation
// computation
// set the thread block and grid according to CurrentDeviceId
// set the thread block and grid according to CurrentDeviceId
const
int
kThreadsPerBlock
=
1024
;
const
int
kThreadsPerBlock
=
1024
;
int
block_cols
=
std
::
min
(
out_cols
,
kThreadsPerBlock
);
int
block_cols
=
kThreadsPerBlock
;
int
block_rows
=
std
::
max
(
kThreadsPerBlock
/
block_cols
,
1
);
if
(
out_cols
<
kThreadsPerBlock
)
{
// block_cols is aligned by 32.
block_cols
=
((
out_cols
+
31
)
>>
5
)
<<
5
;
}
int
block_rows
=
kThreadsPerBlock
/
block_cols
;
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
int
dev_id
=
paddle
::
platform
::
GetCurrentDeviceId
();
int
max_threads
=
context
.
GetMaxPhysicalThreadCount
();
int
multi_process
=
paddle
::
platform
::
GetCUDAMultiProcessors
(
dev_id
);
int
max_threads_per_mp
=
paddle
::
platform
::
GetCUDAMaxThreadsPerMultiProcessor
(
dev_id
);
int
max_threads
=
multi_process
*
max_threads_per_mp
;
int
max_blocks
=
std
::
max
(
max_threads
/
kThreadsPerBlock
,
1
);
int
max_blocks
=
std
::
max
(
max_threads
/
kThreadsPerBlock
,
1
);
int
grid_cols
=
int
grid_cols
=
...
@@ -218,8 +217,8 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...
@@ -218,8 +217,8 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int
input_col
=
0
;
int
input_col
=
0
;
bool
sameShape
=
true
;
bool
sameShape
=
true
;
paddle
::
framework
::
Vector
<
int16_t
>
outputs_data
(
num
*
sizeof
(
T
*
)
/
2
);
framework
::
Vector
<
int16_t
>
outputs_data
(
num
*
sizeof
(
T
*
)
/
2
);
paddle
::
framework
::
Vector
<
int
>
outputs_cols
(
num
+
1
);
framework
::
Vector
<
int
>
outputs_cols
(
num
+
1
);
outputs_cols
[
0
]
=
0
;
outputs_cols
[
0
]
=
0
;
T
**
outputs_ptr
=
reinterpret_cast
<
T
**>
(
outputs_data
.
data
());
T
**
outputs_ptr
=
reinterpret_cast
<
T
**>
(
outputs_data
.
data
());
...
@@ -239,12 +238,20 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...
@@ -239,12 +238,20 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
// computation
// computation
const
int
kThreadsPerBlock
=
1024
;
const
int
kThreadsPerBlock
=
1024
;
int
block_cols
=
std
::
min
(
input_col
,
kThreadsPerBlock
);
int
block_cols
=
kThreadsPerBlock
;
int
block_rows
=
std
::
max
(
kThreadsPerBlock
/
block_cols
,
1
);
if
(
input_col
<
kThreadsPerBlock
)
{
// block_cols is aligned by 32.
block_cols
=
((
input_col
+
31
)
>>
5
)
<<
5
;
}
int
block_rows
=
kThreadsPerBlock
/
block_cols
;
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
int
grid_cols
=
(
input_col
+
block_cols
-
1
)
/
block_cols
;
int
max_threads
=
context
.
GetMaxPhysicalThreadCount
();
int
grid_rows
=
(
input_row
+
block_rows
-
1
)
/
block_rows
;
int
max_blocks
=
std
::
max
(
max_threads
/
kThreadsPerBlock
,
1
);
int
grid_cols
=
std
::
min
((
input_col
+
block_cols
-
1
)
/
block_cols
,
max_blocks
);
int
grid_rows
=
std
::
min
(
max_blocks
/
grid_cols
,
std
::
max
(
input_row
/
block_rows
,
1
));
dim3
grid_size
=
dim3
(
grid_cols
,
grid_rows
,
1
);
dim3
grid_size
=
dim3
(
grid_cols
,
grid_rows
,
1
);
if
(
sameShape
)
{
if
(
sameShape
)
{
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
131ec276
...
@@ -121,6 +121,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
...
@@ -121,6 +121,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
place_
(
place
)
{
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
place_
(
place
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
multi_process
=
GetCUDAMultiProcessors
(
place_
.
device
);
max_threads_per_mp
=
GetCUDAMaxThreadsPerMultiProcessor
(
place_
.
device
);
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
eigen_stream_
.
reset
(
new
EigenCudaStreamDevice
());
eigen_stream_
.
reset
(
new
EigenCudaStreamDevice
());
eigen_stream_
->
Reinitialize
(
&
stream_
,
place
);
eigen_stream_
->
Reinitialize
(
&
stream_
,
place
);
...
@@ -154,6 +156,10 @@ void CUDADeviceContext::Wait() const {
...
@@ -154,6 +156,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE
(
cudaGetLastError
());
PADDLE_ENFORCE
(
cudaGetLastError
());
}
}
int
CUDADeviceContext
::
GetMaxPhysicalThreadCount
()
const
{
return
multi_process
*
max_threads_per_mp
;
}
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
return
eigen_device_
.
get
();
return
eigen_device_
.
get
();
}
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
131ec276
...
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */
/*! \brief Return place in the device context. */
Place
GetPlace
()
const
override
;
Place
GetPlace
()
const
override
;
/*! \brief Return the max physical thread count in the device context */
int
GetMaxPhysicalThreadCount
()
const
;
/*! \brief Return eigen device in the device context. */
/*! \brief Return eigen device in the device context. */
Eigen
::
GpuDevice
*
eigen_device
()
const
;
Eigen
::
GpuDevice
*
eigen_device
()
const
;
...
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t
stream_
;
cudaStream_t
stream_
;
cudnnHandle_t
cudnn_handle_
;
cudnnHandle_t
cudnn_handle_
;
cublasHandle_t
cublas_handle_
;
cublasHandle_t
cublas_handle_
;
int
multi_process
;
int
max_threads_per_mp
;
};
};
template
<
>
template
<
>
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录