BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 556d5097 (unverified), authored on Jan 14, 2022 by YuanRisheng; committed via GitHub on Jan 14, 2022.
refactor impl of elementwise op part2 (#38898)
Parent: 7f8d5bc8
Showing 3 changed files with 919 additions and 614 deletions (+919, -614):
paddle/fluid/operators/elementwise/elementwise_op_function.h  (+7, -614)
paddle/pten/kernels/cpu/elementwise.h  (+144, -0)
paddle/pten/kernels/gpu/elementwise.h  (+768, -0)
paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -49,12 +49,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
#define GetDivMod(dividend, divisor, div, mod) \
do { \
const auto dividend_copy = dividend; \
*div = dividend_copy / divisor; \
*mod = dividend_copy % divisor; \
} while (0)
#define DIVUP(x, y) (((x) + (y)-1) / (y))
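
As a side note, the GetDivMod macro removed here (and re-added in paddle/pten/kernels/gpu/elementwise.h further down) is a combined quotient/remainder step used to decode a flattened index into per-dimension coordinates. A minimal standalone illustration, with a made-up shape and index (not part of the commit):

#include <cassert>

#define GetDivMod(dividend, divisor, div, mod) \
  do {                                         \
    const auto dividend_copy = dividend;       \
    *div = dividend_copy / divisor;            \
    *mod = dividend_copy % divisor;            \
  } while (0)

int main() {
  // Decode linear index 7 of a shape [3, 4] tensor into (row, col).
  int linear = 7, row = 0, col = 0;
  GetDivMod(linear, 4, &row, &col);  // row = 7 / 4, col = 7 % 4
  assert(row == 1 && col == 3);
  return 0;
}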
@@ -138,613 +132,11 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
                         axis);
}

template <typename Functor, typename T, typename OutType = T>
void CommonForwardBroadcastCPU(const framework::Tensor *x,
                               const framework::Tensor *y,
                               framework::Tensor *z, int *x_dims_array,
                               int *y_dims_array, int *out_dims_array,
                               int max_dim,
                               const platform::CPUDeviceContext &ctx,
                               Functor func,
                               const bool is_xsize_larger = true) {
  pten::CommonForwardBroadcastCPU(x, y, z, x_dims_array, y_dims_array,
                                  out_dims_array, max_dim, ctx, func,
                                  is_xsize_larger);
}

#if defined(__NVCC__) || defined(__HIPCC__)

template <typename T, typename DX_OP, typename Tout = T>
__global__ void CommonGradBroadcastCUDAKernel(
    const int *x_strides_array, const int *y_strides_array,
    const int *out_dims_array, const int *y_strides_order,
    const int *y_dims_order, const T *x, const T *y, const Tout *out,
    const Tout *dout, T *dx, int out_size, int max_dim, int thread_num,
    DX_OP dx_op) {
  T val(0);
  int i = blockIdx.x;
  int tid = threadIdx.x;
  for (int j = tid; j < thread_num; j += blockDim.x) {
    const int X_index = i * thread_num + j;
    int out_index = X_index;
    int C_index = 0;
    int B_index = i * thread_num + j;
    int remainder = 0;
#pragma unroll
    for (int d = max_dim - 1; d >= 0; --d) {
      GetDivMod(B_index, y_dims_order[d], &B_index, &remainder);
      C_index += remainder * y_strides_order[d];
    }
    int x_index = 0;
    int y_index = 0;
    int C_index_val = C_index;
#pragma unroll
    for (int d = max_dim - 1; d >= 0; --d) {
      GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder);
      x_index += remainder * x_strides_array[d];
      y_index += remainder * y_strides_array[d];
    }
    out_index = C_index;
    val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]);
  }
  val = paddle::platform::reduceSum(val, tid, thread_num);
  if (threadIdx.x == 0) {
    dx[i] = val;
  }
}

template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonGradBroadcastCUDA(const framework::Tensor &x,
                             const framework::Tensor &y,
                             const framework::Tensor &out,
                             const framework::Tensor &dout,
                             framework::Tensor *dx, framework::Tensor *dy,
                             int *x_dims_array, int *y_dims_array,
                             int *out_dims_array, int max_dim,
                             const platform::CUDADeviceContext &ctx,
                             DX_OP dx_op, DY_OP dy_op) {
  const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
  auto cplace = platform::CPUPlace();
  const T *x_data = x.data<T>();
  const T *y_data = y.data<T>();
  const Tout *out_data = out.data<Tout>();
  const Tout *dout_data = dout.data<Tout>();
  T *dx_data = dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace());
  T *dy_data = dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace());
  std::vector<int> x_one_indexs;
  std::vector<int> y_one_indexs;
  for (int i = 0; i < max_dim; i++) {
    if (x_dims_array[i] != y_dims_array[i]) {
      if (x_dims_array[i] == 1) {
        x_one_indexs.push_back(i);
      }
      if (y_dims_array[i] == 1) {
        y_one_indexs.push_back(i);
      }
    }
  }

  std::vector<int> x_trans_indexs(max_dim);
  std::vector<int> y_trans_indexs(max_dim);
  pten::ComputeBroadcastTranspositionArray(
      x_one_indexs.data(), x_trans_indexs.data(), max_dim,
      x_one_indexs.size());
  pten::ComputeBroadcastTranspositionArray(
      y_one_indexs.data(), y_trans_indexs.data(), max_dim,
      y_one_indexs.size());

  // compute array stride for cuda kernel;
  // e.g. x.dims=[2,3,4], x_stride=[12,4,1]
  std::vector<int> x_strides_array(max_dim);
  std::vector<int> y_strides_array(max_dim);
  std::vector<int> out_strides_array(max_dim);
  int x_stride = 1;
  int y_stride = 1;
  int z_stride = 1;
  for (int i = max_dim - 1; i >= 0; i--) {
    x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride;
    y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride;
    out_strides_array[i] = z_stride;
    x_stride *= x_dims_array[i];
    y_stride *= y_dims_array[i];
    z_stride *= out_dims_array[i];
  }

  std::vector<int> x_strides_order(max_dim);
  std::vector<int> y_strides_order(max_dim);
  std::vector<int> x_dims_order(max_dim);
  std::vector<int> y_dims_order(max_dim);
  for (int i = 0; i < max_dim; ++i) {
    x_strides_order[i] = out_strides_array[x_trans_indexs[i]];
    y_strides_order[i] = out_strides_array[y_trans_indexs[i]];
    x_dims_order[i] = out_dims_array[x_trans_indexs[i]];
    y_dims_order[i] = out_dims_array[y_trans_indexs[i]];
  }
  std::vector<int> x_broadcast_pos;
  std::vector<int> y_broadcast_pos;

  int bytes = max_dim * sizeof(int);

  for (int i = 0; i < max_dim; ++i) {
    if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) {
      x_broadcast_pos.emplace_back(i);
    }
    if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) {
      y_broadcast_pos.emplace_back(i);
    }
  }

  auto stream = ctx.stream();
  bool can_split_x = false;
  bool can_split_y = false;

  auto FastCommonCUDAF = [&](const std::vector<int> &broadcast_pos,
                             bool is_y) {
    int h = std::accumulate(out_dims_array,
                            out_dims_array + broadcast_pos.size(), 1,
                            std::multiplies<int>());
    int w = std::accumulate(out_dims_array + broadcast_pos.size(),
                            out_dims_array + max_dim, 1,
                            std::multiplies<int>());

    VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h
            << " is_y:" << is_y;

    int split_h;
    int split_w;
    int kh = h;
    int kw = w;

    if (is_y) {
      split_h = std::accumulate(x_dims_array,
                                x_dims_array + broadcast_pos.size(), 1,
                                std::multiplies<int>());
      split_w = std::accumulate(x_dims_array + broadcast_pos.size(),
                                x_dims_array + max_dim, 1,
                                std::multiplies<int>());
    } else {
      split_h = std::accumulate(y_dims_array,
                                y_dims_array + broadcast_pos.size(), 1,
                                std::multiplies<int>());
      split_w = std::accumulate(y_dims_array + broadcast_pos.size(),
                                y_dims_array + max_dim, 1,
                                std::multiplies<int>());
    }

    if (h > split_h) kh = split_h;
    if (w > split_w) kw = split_w;

    if (is_y) {
      if (w < 16 || h < 16) {
        int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
        int grid_size = w;
        pten::CommonGradBroadcast1CUDAKernelHeight<<<grid_size, block_size, 0,
                                                     stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw,
            is_y);
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
        pten::FastCommonGradBroadcastCUDAKernelHeight<<<grid_size, block_size,
                                                        0, stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw,
            is_y);
      }
    } else {
      if (w < 16 || h < 16) {
        int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
        int grid_size = w;
        pten::CommonGradBroadcast1CUDAKernelHeight<<<grid_size, block_size, 0,
                                                     stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw,
            is_y);
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
        pten::FastCommonGradBroadcastCUDAKernelHeight<<<grid_size, block_size,
                                                        0, stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw,
            is_y);
      }
    }
  };

  auto FastBroadCastHeightCUDAF = [&](const std::vector<int> &broadcast_pos,
                                      bool x_large) {
    int h = std::accumulate(out_dims_array,
                            out_dims_array + broadcast_pos.size(), 1,
                            std::multiplies<int>());
    int w = std::accumulate(out_dims_array + broadcast_pos.size(),
                            out_dims_array + max_dim, 1,
                            std::multiplies<int>());

    VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h;

    if (w < 16 || h < 16) {
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
      int grid_size = w;
      pten::ElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0,
                                               stream>>>(
          x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op,
          dx_data, dy_data);
    } else {
      dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
      int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
      pten::FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0,
                                                   stream>>>(
          x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op,
          dx_data, dy_data);
    }
  };

  auto FastBroadCastAllCUDAF = [&](const std::vector<int> &broadcast_pos,
                                   int max_dim, bool is_x_large) {
    int axis = broadcast_pos[0];
    int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
                              std::multiplies<int>());
    int mid = 1;
    int post = 1;

    if (broadcast_pos.size() == 1) {
      mid = out_dims_array[axis];
      post = std::accumulate(out_dims_array + axis + 1,
                             out_dims_array + max_dim, 1,
                             std::multiplies<int>());
    } else {
      mid = std::accumulate(out_dims_array + axis,
                            out_dims_array + broadcast_pos.back() + 1, 1,
                            std::multiplies<int>());
      post = std::accumulate(out_dims_array + broadcast_pos.back() + 1,
                             out_dims_array + max_dim, 1,
                             std::multiplies<int>());
    }

    VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid
            << " post:" << post;

    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
    int grid_size = pre * post;

    pten::FastCommonGradBroadcastAllCUDAKernel<<<grid_size, block_size, 0,
                                                 stream>>>(
        x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op,
        dy_op, dx_data, dy_data);
  };

  auto FastBroadCastOneCUDAF = [&](const std::vector<int> &broadcast_pos,
                                   int max_dim, bool is_x) {
    int axis = broadcast_pos[0];
    int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
                              std::multiplies<int>());
    int mid = out_dims_array[axis];
    int post = std::accumulate(out_dims_array + axis + 1,
                               out_dims_array + max_dim, 1,
                               std::multiplies<int>());

    int k_pre;
    int k_mid;
    int k_post;

    if (is_x) {
      k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1,
                              std::multiplies<int>());
      k_mid = y_dims_array[axis];
      k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim,
                               1, std::multiplies<int>());
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
      int grid_size = pre * post;
      // we need to calc y offset with blockid, so do x_pre/y_pre to get left
      // size.
      if (k_pre != pre) k_pre = pre / k_pre;

      pten::FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
                                                   stream>>>(
          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
          k_post, true, dx_op, dx_data);
    } else {
      k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1,
                              std::multiplies<int>());
      k_mid = x_dims_array[axis];
      k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim,
                               1, std::multiplies<int>());
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
      int grid_size = pre * post;
      if (k_pre != pre) k_pre = pre / k_pre;

      pten::FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
                                                   stream>>>(
          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
          k_post, false, dy_op, dy_data);
    }
    VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid
            << " post:" << post;
  };

  // do fast elementwise if: 1. only one input need to do broadcast, we can
  // fallback
  // to old fast path.
  // 2. if both x and y need broadcast, then do it one by one.
  bool fast_broadcast = false;
  if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) {
    can_split_y = pten::SplitDims(y_broadcast_pos, max_dim);
    if (can_split_y) {
      // only y need to do broadcast on h
      if (y_broadcast_pos[0] == 0) {
        FastBroadCastHeightCUDAF(y_broadcast_pos, true);
        fast_broadcast = true;
      }
    } else if (y_broadcast_pos.size() == 1 ||
               pten::CheckContiguousDims(y_broadcast_pos)) {
      // for only one dim and
      // contiguous broadcast.
      // If cannot split, which means input has 3 parts
      FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true);
      fast_broadcast = true;
    }
  } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) {
    // only x need broadcast
    can_split_x = pten::SplitDims(x_broadcast_pos, max_dim);
    if (can_split_x) {
      if (x_broadcast_pos[0] == 0) {
        FastBroadCastHeightCUDAF(x_broadcast_pos, false);
        fast_broadcast = true;
      }
    } else if (x_broadcast_pos.size() == 1 ||
               pten::CheckContiguousDims(x_broadcast_pos)) {
      FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false);
      fast_broadcast = true;
    }
  } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) {
    // do x and y broadcast each.
    can_split_y = pten::SplitDims(y_broadcast_pos, max_dim);
    bool fast_broadcast_x = false;
    bool fast_broadcast_y = false;
    if (can_split_y) {
      // begin at start.
      if (y_broadcast_pos[0] == 0) {
        FastCommonCUDAF(y_broadcast_pos, true);
        fast_broadcast_y = true;
      }
    } else if (y_broadcast_pos.size() == 1) {
      FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false);
      can_split_y = true;
      fast_broadcast_y = true;
    }
    can_split_x = pten::SplitDims(x_broadcast_pos, max_dim);
    if (can_split_x) {
      if (x_broadcast_pos[0] == 0) {
        FastCommonCUDAF(x_broadcast_pos, false);
        fast_broadcast_x = true;
      }
    } else if (x_broadcast_pos.size() == 1) {
      FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true);
      can_split_x = true;
      fast_broadcast_x = true;
    }
    VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y
            << " can_split_x:" << can_split_x;
    // if both x and y into fast path then return
    if (fast_broadcast_x && fast_broadcast_y) {
      fast_broadcast = true;
    }
    if (can_split_y && can_split_x && fast_broadcast) return;
  }

  // Should remove memory copy, use reg instead.
  if (fast_broadcast) {
    return;
  }

  int x_blocks = 0;
  int x_threads = 0;
  pten::ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks,
                                   &x_threads, max_dim);
  int y_blocks = 0;
  int y_threads = 0;
  pten::ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks,
                                   &y_threads, max_dim);

  auto x_strides_array_tmp = memory::Alloc(ctx, bytes);
  int *x_strides_array_gpu =
      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
  memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(),
               bytes, ctx.stream());

  auto y_strides_array_tmp = memory::Alloc(ctx, bytes);
  int *y_strides_array_gpu =
      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
  memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(),
               bytes, ctx.stream());

  auto out_dims_array_tmp = memory::Alloc(ctx, bytes);
  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
  memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes,
               ctx.stream());

  const int out_size = std::accumulate(
      out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
  int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
  int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
  if (dx) {
    auto x_strides_order_tmp = memory::Alloc(ctx, bytes);
    int *x_strides_order_gpu =
        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
    memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(),
                 bytes, ctx.stream());

    auto x_dims_order_tmp = memory::Alloc(ctx, bytes);
    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
    memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(), bytes,
                 ctx.stream());
    CommonGradBroadcastCUDAKernel<T, DX_OP, Tout>
        <<<x_blocks, x_block_size, 0, ctx.stream()>>>(
            x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu,
            x_strides_order_gpu, x_dims_order_gpu, x_data, y_data, out_data,
            dout_data, dx_data, out_size, max_dim, x_threads, dx_op);
  }
  if (dy) {
    auto y_strides_order_tmp = memory::Alloc(ctx, bytes);
    int *y_strides_order_gpu =
        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
    memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(),
                 bytes, ctx.stream());

    auto y_dims_order_tmp = memory::Alloc(ctx, bytes);
    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
    memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(), bytes,
                 ctx.stream());
    CommonGradBroadcastCUDAKernel<T, DY_OP, Tout>
        <<<y_blocks, y_block_size, 0, ctx.stream()>>>(
            x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu,
            y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data,
            dout_data, dy_data, out_size, max_dim, y_threads, dy_op);
  }
}

#endif  // __NVCC__ or __HIPCC__

inline framework::DDim trim_trailing_singular_dims(
    const framework::DDim &dims) {
  return pten::funcs::trim_trailing_singular_dims(dims);
}

template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
          typename Tout = T>
void CommonElementwiseBroadcastBackward(
    const framework::ExecutionContext &ctx, const framework::DDim &x_dims,
    const framework::DDim &y_dims, const framework::Tensor &x,
    const framework::Tensor &y, const framework::Tensor &out,
    const framework::Tensor &dout, int axis, framework::Tensor *dx,
    framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
  int max_dim = std::max(x_dims.size(), y_dims.size());
  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                         y_dims_array.data(), out_dims_array.data(), max_dim,
                         axis);
  // for inplace strategy. memset will make dx and dout clear and get wrong
  // result.
  if (dx && dx->IsSharedBufferWith(dout)) {
    dx->clear();
    dx->mutable_data<T>(x_dims, ctx.GetPlace());
  }

  VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
          << framework::make_ddim(x_dims_array)
          << " ydim:" << framework::make_ddim(y_dims_array);

  if (platform::is_gpu_place(ctx.GetPlace())) {
#if defined(__NVCC__) || defined(__HIPCC__)
    CommonGradBroadcastCUDA<T, DX_OP, DY_OP, Tout>(
        x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
        out_dims_array.data(), max_dim,
        ctx.template device_context<platform::CUDADeviceContext>(), dx_op,
        dy_op);
#endif
  } else {
    pten::CommonGradBroadcastCPU<T, DX_OP, DY_OP, Tout>(
        x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
        out_dims_array.data(), max_dim,
        ctx.template device_context<platform::CPUDeviceContext>(), dx_op,
        dy_op);
  }
}

template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
          typename Tout = T>
void ElemwiseGradComputeWithBroadcast(
    const framework::ExecutionContext &ctx, const framework::DDim &x_dims,
    const framework::DDim &y_dims, const framework::Tensor &x,
    const framework::Tensor &y, const framework::Tensor &out,
    const framework::Tensor &dout, int axis, framework::Tensor *dx,
    framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
  bool is_xsize_larger = true;

  int max_dim = x_dims.size();
  if (x_dims.size() < y_dims.size()) {
    is_xsize_larger = false;
    max_dim = y_dims.size();
  }

  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  PADDLE_ENFORCE_GE(
      axis, 0,
      platform::errors::InvalidArgument(
          "Axis should be great than or equal to 0, but received axis is %d.",
          axis));
  PADDLE_ENFORCE_LT(
      axis, max_dim,
      platform::errors::InvalidArgument(
          "Axis should be less than %d, but received axis is %d.", max_dim,
          axis));

  int pre, n, post, is_run_common_broadcast, axis_trim = 0;
  if (is_xsize_larger) {
    auto y_dims_trimed = trim_trailing_singular_dims(y_dims);
    axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
    pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n,
                              &post, &is_run_common_broadcast);
  } else {
    auto x_dims_trimed = trim_trailing_singular_dims(x_dims);
    axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
    pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n,
                              &post, &is_run_common_broadcast);
  }
  // special case for common backward implementation.
  if (is_run_common_broadcast) {
    CommonElementwiseBroadcastBackward<DeviceContext, T, DX_OP, DY_OP, Tout>(
        ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    return;
  }
  if (post == 1) {
    if (platform::is_gpu_place(ctx.GetPlace())) {
#if defined(__NVCC__) || defined(__HIPCC__)
      pten::ElemwiseGradBroadcast1CUDA(
          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
          y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre, n,
          is_xsize_larger, dx_op, dy_op,
          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
#endif
    } else {
      pten::ElemwiseGradBroadcast1CPU(
          x.data<T>(), y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre,
          n, is_xsize_larger, dx_op, dy_op,
          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
    }
  } else {
    if (platform::is_gpu_place(ctx.GetPlace())) {
#if defined(__NVCC__) || defined(__HIPCC__)
      pten::ElemwiseGradBroadcast2CUDA(
          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
          y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre, n, post,
          is_xsize_larger, dx_op, dy_op,
          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
#endif
    } else {
      pten::ElemwiseGradBroadcast2CPU(
          x.data<T>(), y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre,
          n, post, is_xsize_larger, dx_op, dy_op,
          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
    }
  }
}

template <typename Functor, typename DeviceContext, typename T,
          typename OutType = T>
void CommonElementwiseBroadcastForward(
    const framework::ExecutionContext &ctx, const framework::Tensor *x,
    const framework::Tensor *y, framework::Tensor *z,
    const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func,
    int axis, const bool is_xsize_larger = true) {
  z->mutable_data<OutType>(ctx.GetPlace());
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
  const auto &dev_ctx = ctx.template device_context<DeviceContext>();
  pten::CommonElementwiseBroadcastForward(dev_ctx, *pt_x.get(), *pt_y.get(),
                                          pt_z.get(), x_dims, y_dims, func,
                                          axis, is_xsize_larger);
}

template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
          typename Tout = T>
void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
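
The stride setup in CommonGradBroadcastCUDA above follows the comment "x.dims=[2,3,4], x_stride=[12,4,1]": strides are row-major, and any dimension of size 1 gets stride 0 so the broadcast operand re-reads the same element along that axis. A small host-only C++ sketch of that rule, with made-up shapes (illustration only, not code from this patch):

#include <cstdio>
#include <vector>

// Row-major strides; a broadcast dimension (size 1) gets stride 0 so the
// same element is re-read for every index along that axis.
std::vector<int> BroadcastStrides(const std::vector<int>& dims) {
  std::vector<int> strides(dims.size());
  int running = 1;
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    strides[i] = (dims[i] == 1) ? 0 : running;
    running *= dims[i];
  }
  return strides;
}

int main() {
  // x has shape [2, 3, 4]; y has shape [1, 3, 1] and is broadcast over x.
  std::vector<int> x_dims = {2, 3, 4};
  std::vector<int> y_dims = {1, 3, 1};
  auto x_strides = BroadcastStrides(x_dims);  // {12, 4, 1}
  auto y_strides = BroadcastStrides(y_dims);  // {0, 1, 0}
  for (size_t i = 0; i < x_dims.size(); ++i) {
    std::printf("dim %zu: x_stride=%d y_stride=%d\n", i, x_strides[i],
                y_strides[i]);
  }
  return 0;
}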
@@ -755,14 +147,14 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx,
                         DX_OP dx_op, DY_OP dy_op) {
  const framework::DDim &x_dim = x.dims();
  const framework::DDim &y_dim = y.dims();
  const auto &dev_ctx = ctx.template device_context<DeviceContext>();
  if (x.dims() == y.dims()) {
    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
    pten::funcs::ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP,
                                                Tout>(
        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
  } else {
    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP, Tout>(
        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    pten::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP, Tout>(
        dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
  }
}
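
ElemwiseGradCompute treats DX_OP and DY_OP as per-element functors of the form operator()(x, y, out, dout) returning one gradient contribution. A hypothetical pair of such functors for z = x * y (illustrative only; the functor definitions below are not taken from this patch):

#include <cassert>

// Illustrative gradient functors for z = x * y, in the shape the
// ElemwiseGradCompute machinery expects: operator()(x, y, out, dout).
template <typename T>
struct MulGradDX {
  T operator()(T x, T y, T out, T dout) const { return dout * y; }
};

template <typename T>
struct MulGradDY {
  T operator()(T x, T y, T out, T dout) const { return dout * x; }
};

int main() {
  MulGradDX<float> dx_op;
  MulGradDY<float> dy_op;
  // For x = 2, y = 3 (so out = 6) and upstream gradient dout = 10:
  assert(dx_op(2.f, 3.f, 6.f, 10.f) == 30.f);  // dL/dx = dout * y
  assert(dy_op(2.f, 3.f, 6.f, 10.f) == 20.f);  // dL/dy = dout * x
  return 0;
}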
@@ -780,14 +172,15 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx,
                         DX_OP dx_op, DY_OP dy_op) {
  const framework::DDim &x_dim = x.dims();
  const framework::DDim &y_dim = y.dims();
  const auto &dev_ctx = ctx.template device_context<DeviceContext>();
  if (x.dims() == y.dims()) {
    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
    pten::funcs::ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP,
                                                DY_OP>(
        dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op,
        dy_op);
  } else {
    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
        ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op);
    pten::ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP>(
        dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op,
        dy_op);
  }
}
paddle/pten/kernels/cpu/elementwise.h
@@ -549,4 +549,148 @@ static void ElemwiseGradBroadcast2CPU(const T* x,
}
}
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonElementwiseBroadcastBackward(const CPUContext &ctx,
                                        const DDim &x_dims, const DDim &y_dims,
                                        const DenseTensor &x,
                                        const DenseTensor &y,
                                        const DenseTensor &out,
                                        const DenseTensor &dout, int axis,
                                        DenseTensor *dx, DenseTensor *dy,
                                        DX_OP dx_op, DY_OP dy_op) {
  int max_dim = std::max(x_dims.size(), y_dims.size());
  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                                y_dims_array.data(), out_dims_array.data(),
                                max_dim, axis);
  // for inplace strategy. memset will make dx and dout clear and get wrong
  // result.
  if (dx && dx->IsSharedBufferWith(dout)) {
    dx->clear();
    dx->mutable_data<T>(x_dims, ctx.GetPlace());
  }

  VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
          << paddle::framework::make_ddim(x_dims_array)
          << " ydim:" << paddle::framework::make_ddim(y_dims_array);

  CommonGradBroadcastCPU<T, DX_OP, DY_OP, Tout>(
      x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
      out_dims_array.data(), max_dim, ctx, dx_op, dy_op);
}

template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx,
                                      const DDim &x_dims, const DDim &y_dims,
                                      const DenseTensor &x,
                                      const DenseTensor &y,
                                      const DenseTensor &out,
                                      const DenseTensor &dout, int axis,
                                      DenseTensor *dx, DenseTensor *dy,
                                      DX_OP dx_op, DY_OP dy_op) {
  bool is_xsize_larger = true;

  int max_dim = x_dims.size();
  if (x_dims.size() < y_dims.size()) {
    is_xsize_larger = false;
    max_dim = y_dims.size();
  }

  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  PADDLE_ENFORCE_GE(
      axis, 0,
      paddle::platform::errors::InvalidArgument(
          "Axis should be great than or equal to 0, but received axis is %d.",
          axis));
  PADDLE_ENFORCE_LT(
      axis, max_dim,
      paddle::platform::errors::InvalidArgument(
          "Axis should be less than %d, but received axis is %d.", max_dim,
          axis));

  int pre, n, post, is_run_common_broadcast, axis_trim = 0;
  if (is_xsize_larger) {
    auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
    axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
    funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post,
                        &is_run_common_broadcast);
  } else {
    auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
    axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
    funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post,
                        &is_run_common_broadcast);
  }
  // special case for common backward implementation.
  if (is_run_common_broadcast) {
    CommonElementwiseBroadcastBackward<T, DX_OP, DY_OP, Tout>(
        ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    return;
  }
  if (post == 1) {
    ElemwiseGradBroadcast1CPU(
        x.data<T>(), y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre, n,
        is_xsize_larger, dx_op, dy_op,
        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
  } else {
    ElemwiseGradBroadcast2CPU(
        x.data<T>(), y.data<T>(), out.data<Tout>(), dout.data<Tout>(), pre, n,
        post, is_xsize_larger, dx_op, dy_op,
        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
  }
}

}  // namespace pten
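
The post == 1 check above selects between ElemwiseGradBroadcast1CPU and ElemwiseGradBroadcast2CPU based on a (pre, n, post) factorisation of the output around the broadcast block. A standalone sketch of that factorisation for one simple case, with made-up shapes (it mirrors what funcs::get_mid_dims produces only for this easy case and is not that function):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  // x has shape [2, 3, 4, 5]; y has shape [3, 4] aligned at axis = 1.
  std::vector<int> x_dims = {2, 3, 4, 5};
  std::vector<int> y_dims = {3, 4};
  int axis = 1;

  // pre  = product of x dims before the aligned block
  // n    = product of the aligned (shared) block
  // post = product of x dims after the aligned block
  int pre = std::accumulate(x_dims.begin(), x_dims.begin() + axis, 1,
                            std::multiplies<int>());
  int n = std::accumulate(y_dims.begin(), y_dims.end(), 1,
                          std::multiplies<int>());
  int post = std::accumulate(x_dims.begin() + axis + y_dims.size(),
                             x_dims.end(), 1, std::multiplies<int>());

  // pre = 2, n = 12, post = 5: post > 1, so the "Broadcast2" path is taken.
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);
  return 0;
}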
paddle/pten/kernels/gpu/elementwise.h
@@ -18,7 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
#ifdef __HIPCC__
constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
@@ -28,6 +31,13 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#define BLOCK_X 32
#define BLOCK_Y 32
#define GetDivMod(dividend, divisor, div, mod) \
do { \
const auto dividend_copy = dividend; \
*div = dividend_copy / divisor; \
*mod = dividend_copy % divisor; \
} while (0)
namespace pten {

namespace kps = paddle::operators::kernel_primitives;
@@ -1469,4 +1479,762 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
      x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
}

template <typename T, typename DX_OP, typename Tout = T>
__global__ void CommonGradBroadcastCUDAKernel(
    const int *x_strides_array, const int *y_strides_array,
    const int *out_dims_array, const int *y_strides_order,
    const int *y_dims_order, const T *x, const T *y, const Tout *out,
    const Tout *dout, T *dx, int out_size, int max_dim, int thread_num,
    DX_OP dx_op) {
  T val(0);
  int i = blockIdx.x;
  int tid = threadIdx.x;
  for (int j = tid; j < thread_num; j += blockDim.x) {
    const int X_index = i * thread_num + j;
    int out_index = X_index;
    int C_index = 0;
    int B_index = i * thread_num + j;
    int remainder = 0;
#pragma unroll
    for (int d = max_dim - 1; d >= 0; --d) {
      GetDivMod(B_index, y_dims_order[d], &B_index, &remainder);
      C_index += remainder * y_strides_order[d];
    }
    int x_index = 0;
    int y_index = 0;
    int C_index_val = C_index;
#pragma unroll
    for (int d = max_dim - 1; d >= 0; --d) {
      GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder);
      x_index += remainder * x_strides_array[d];
      y_index += remainder * y_strides_array[d];
    }
    out_index = C_index;
    val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]);
  }
  val = paddle::platform::reduceSum(val, tid, thread_num);
  if (threadIdx.x == 0) {
    dx[i] = val;
  }
}

template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonGradBroadcastCUDA(const DenseTensor &x, const DenseTensor &y,
                             const DenseTensor &out, const DenseTensor &dout,
                             DenseTensor *dx, DenseTensor *dy,
                             int *x_dims_array, int *y_dims_array,
                             int *out_dims_array, int max_dim,
                             const GPUContext &ctx, DX_OP dx_op,
                             DY_OP dy_op) {
  const auto gplace =
      BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace());
  auto cplace = paddle::platform::CPUPlace();
  const T *x_data = x.data<T>();
  const T *y_data = y.data<T>();
  const Tout *out_data = out.data<Tout>();
  const Tout *dout_data = dout.data<Tout>();
  T *dx_data = dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace());
  T *dy_data = dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace());
  std::vector<int> x_one_indexs;
  std::vector<int> y_one_indexs;
  for (int i = 0; i < max_dim; i++) {
    if (x_dims_array[i] != y_dims_array[i]) {
      if (x_dims_array[i] == 1) {
        x_one_indexs.push_back(i);
      }
      if (y_dims_array[i] == 1) {
        y_one_indexs.push_back(i);
      }
    }
  }

  std::vector<int> x_trans_indexs(max_dim);
  std::vector<int> y_trans_indexs(max_dim);
  ComputeBroadcastTranspositionArray(x_one_indexs.data(),
                                     x_trans_indexs.data(), max_dim,
                                     x_one_indexs.size());
  ComputeBroadcastTranspositionArray(y_one_indexs.data(),
                                     y_trans_indexs.data(), max_dim,
                                     y_one_indexs.size());

  // compute array stride for cuda kernel;
  // e.g. x.dims=[2,3,4], x_stride=[12,4,1]
  std::vector<int> x_strides_array(max_dim);
  std::vector<int> y_strides_array(max_dim);
  std::vector<int> out_strides_array(max_dim);
  int x_stride = 1;
  int y_stride = 1;
  int z_stride = 1;
  for (int i = max_dim - 1; i >= 0; i--) {
    x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride;
    y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride;
    out_strides_array[i] = z_stride;
    x_stride *= x_dims_array[i];
    y_stride *= y_dims_array[i];
    z_stride *= out_dims_array[i];
  }

  std::vector<int> x_strides_order(max_dim);
  std::vector<int> y_strides_order(max_dim);
  std::vector<int> x_dims_order(max_dim);
  std::vector<int> y_dims_order(max_dim);
  for (int i = 0; i < max_dim; ++i) {
    x_strides_order[i] = out_strides_array[x_trans_indexs[i]];
    y_strides_order[i] = out_strides_array[y_trans_indexs[i]];
    x_dims_order[i] = out_dims_array[x_trans_indexs[i]];
    y_dims_order[i] = out_dims_array[y_trans_indexs[i]];
  }
  std::vector<int> x_broadcast_pos;
  std::vector<int> y_broadcast_pos;

  int bytes = max_dim * sizeof(int);

  for (int i = 0; i < max_dim; ++i) {
    if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) {
      x_broadcast_pos.emplace_back(i);
    }
    if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) {
      y_broadcast_pos.emplace_back(i);
    }
  }

  auto stream = ctx.stream();
  bool can_split_x = false;
  bool can_split_y = false;

  auto FastCommonCUDAF = [&](const std::vector<int> &broadcast_pos,
                             bool is_y) {
    int h = std::accumulate(out_dims_array,
                            out_dims_array + broadcast_pos.size(), 1,
                            std::multiplies<int>());
    int w = std::accumulate(out_dims_array + broadcast_pos.size(),
                            out_dims_array + max_dim, 1,
                            std::multiplies<int>());

    VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h
            << " is_y:" << is_y;

    int split_h;
    int split_w;
    int kh = h;
    int kw = w;

    if (is_y) {
      split_h = std::accumulate(x_dims_array,
                                x_dims_array + broadcast_pos.size(), 1,
                                std::multiplies<int>());
      split_w = std::accumulate(x_dims_array + broadcast_pos.size(),
                                x_dims_array + max_dim, 1,
                                std::multiplies<int>());
    } else {
      split_h = std::accumulate(y_dims_array,
                                y_dims_array + broadcast_pos.size(), 1,
                                std::multiplies<int>());
      split_w = std::accumulate(y_dims_array + broadcast_pos.size(),
                                y_dims_array + max_dim, 1,
                                std::multiplies<int>());
    }

    if (h > split_h) kh = split_h;
    if (w > split_w) kw = split_w;

    if (is_y) {
      if (w < 16 || h < 16) {
        int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
        int grid_size = w;
        CommonGradBroadcast1CUDAKernelHeight<<<grid_size, block_size, 0,
                                               stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw,
            is_y);
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
        FastCommonGradBroadcastCUDAKernelHeight<<<grid_size, block_size, 0,
                                                  stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw,
            is_y);
      }
    } else {
      if (w < 16 || h < 16) {
        int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
        int grid_size = w;
        CommonGradBroadcast1CUDAKernelHeight<<<grid_size, block_size, 0,
                                               stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw,
            is_y);
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
        FastCommonGradBroadcastCUDAKernelHeight<<<grid_size, block_size, 0,
                                                  stream>>>(
            x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw,
            is_y);
      }
    }
  };

  auto FastBroadCastHeightCUDAF = [&](const std::vector<int> &broadcast_pos,
                                      bool x_large) {
    int h = std::accumulate(out_dims_array,
                            out_dims_array + broadcast_pos.size(), 1,
                            std::multiplies<int>());
    int w = std::accumulate(out_dims_array + broadcast_pos.size(),
                            out_dims_array + max_dim, 1,
                            std::multiplies<int>());

    VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h;

    if (w < 16 || h < 16) {
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
      int grid_size = w;
      ElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
          x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op,
          dx_data, dy_data);
    } else {
      dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
      int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
      FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0,
                                             stream>>>(
          x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op,
          dx_data, dy_data);
    }
  };

  auto FastBroadCastAllCUDAF = [&](const std::vector<int> &broadcast_pos,
                                   int max_dim, bool is_x_large) {
    int axis = broadcast_pos[0];
    int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
                              std::multiplies<int>());
    int mid = 1;
    int post = 1;

    if (broadcast_pos.size() == 1) {
      mid = out_dims_array[axis];
      post = std::accumulate(out_dims_array + axis + 1,
                             out_dims_array + max_dim, 1,
                             std::multiplies<int>());
    } else {
      mid = std::accumulate(out_dims_array + axis,
                            out_dims_array + broadcast_pos.back() + 1, 1,
                            std::multiplies<int>());
      post = std::accumulate(out_dims_array + broadcast_pos.back() + 1,
                             out_dims_array + max_dim, 1,
                             std::multiplies<int>());
    }

    VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid
            << " post:" << post;

    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
    int grid_size = pre * post;

    FastCommonGradBroadcastAllCUDAKernel<<<grid_size, block_size, 0,
                                           stream>>>(
        x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op,
        dy_op, dx_data, dy_data);
  };

  auto FastBroadCastOneCUDAF = [&](const std::vector<int> &broadcast_pos,
                                   int max_dim, bool is_x) {
    int axis = broadcast_pos[0];
    int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1,
                              std::multiplies<int>());
    int mid = out_dims_array[axis];
    int post = std::accumulate(out_dims_array + axis + 1,
                               out_dims_array + max_dim, 1,
                               std::multiplies<int>());

    int k_pre;
    int k_mid;
    int k_post;

    if (is_x) {
      k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1,
                              std::multiplies<int>());
      k_mid = y_dims_array[axis];
      k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim,
                               1, std::multiplies<int>());
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
      int grid_size = pre * post;
      // we need to calc y offset with blockid, so do x_pre/y_pre to get left
      // size.
      if (k_pre != pre) k_pre = pre / k_pre;

      FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
                                             stream>>>(
          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
          k_post, true, dx_op, dx_data);
    } else {
      k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1,
                              std::multiplies<int>());
      k_mid = x_dims_array[axis];
      k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim,
                               1, std::multiplies<int>());
      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
      int grid_size = pre * post;
      if (k_pre != pre) k_pre = pre / k_pre;

      FastCommonGradBroadcastOneCUDAKernel<<<grid_size, block_size, 0,
                                             stream>>>(
          x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid,
          k_post, false, dy_op, dy_data);
    }
    VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid
            << " post:" << post;
  };

  // do fast elementwise if: 1. only one input need to do broadcast, we can
  // fallback
  // to old fast path.
  // 2. if both x and y need broadcast, then do it one by one.
  bool fast_broadcast = false;
  if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) {
    can_split_y = SplitDims(y_broadcast_pos, max_dim);
    if (can_split_y) {
      // only y need to do broadcast on h
      if (y_broadcast_pos[0] == 0) {
        FastBroadCastHeightCUDAF(y_broadcast_pos, true);
        fast_broadcast = true;
      }
    } else if (y_broadcast_pos.size() == 1 ||
               CheckContiguousDims(y_broadcast_pos)) {
      // for only one dim and
      // contiguous broadcast.
      // If cannot split, which means input has 3 parts
      FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true);
      fast_broadcast = true;
    }
  } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) {
    // only x need broadcast
    can_split_x = SplitDims(x_broadcast_pos, max_dim);
    if (can_split_x) {
      if (x_broadcast_pos[0] == 0) {
        FastBroadCastHeightCUDAF(x_broadcast_pos, false);
        fast_broadcast = true;
      }
    } else if (x_broadcast_pos.size() == 1 ||
               CheckContiguousDims(x_broadcast_pos)) {
      FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false);
      fast_broadcast = true;
    }
  } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) {
    // do x and y broadcast each.
    can_split_y = SplitDims(y_broadcast_pos, max_dim);
    bool fast_broadcast_x = false;
    bool fast_broadcast_y = false;
    if (can_split_y) {
      // begin at start.
      if (y_broadcast_pos[0] == 0) {
        FastCommonCUDAF(y_broadcast_pos, true);
        fast_broadcast_y = true;
      }
    } else if (y_broadcast_pos.size() == 1) {
      FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false);
      can_split_y = true;
      fast_broadcast_y = true;
    }
    can_split_x = SplitDims(x_broadcast_pos, max_dim);
    if (can_split_x) {
      if (x_broadcast_pos[0] == 0) {
        FastCommonCUDAF(x_broadcast_pos, false);
        fast_broadcast_x = true;
      }
    } else if (x_broadcast_pos.size() == 1) {
      FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true);
      can_split_x = true;
      fast_broadcast_x = true;
    }
    VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y
            << " can_split_x:" << can_split_x;
    // if both x and y into fast path then return
    if (fast_broadcast_x && fast_broadcast_y) {
      fast_broadcast = true;
    }
    if (can_split_y && can_split_x && fast_broadcast) return;
  }

  // Should remove memory copy, use reg instead.
  if (fast_broadcast) {
    return;
  }

  int x_blocks = 0;
  int x_threads = 0;
  ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks,
                             &x_threads, max_dim);
  int y_blocks = 0;
  int y_threads = 0;
  ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks,
                             &y_threads, max_dim);

  auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes);
  int *x_strides_array_gpu =
      reinterpret_cast<int *>(x_strides_array_tmp->ptr());
  paddle::memory::Copy(gplace, x_strides_array_gpu, cplace,
                       x_strides_array.data(), bytes, ctx.stream());

  auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes);
  int *y_strides_array_gpu =
      reinterpret_cast<int *>(y_strides_array_tmp->ptr());
  paddle::memory::Copy(gplace, y_strides_array_gpu, cplace,
                       y_strides_array.data(), bytes, ctx.stream());

  auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes);
  int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
  paddle::memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array,
                       bytes, ctx.stream());

  const int out_size = std::accumulate(
      out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
  int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
  int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
  if (dx) {
    auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes);
    int *x_strides_order_gpu =
        reinterpret_cast<int *>(x_strides_order_tmp->ptr());
    paddle::memory::Copy(gplace, x_strides_order_gpu, cplace,
                         x_strides_order.data(), bytes, ctx.stream());

    auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes);
    int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
    paddle::memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(),
                         bytes, ctx.stream());
    CommonGradBroadcastCUDAKernel<T, DX_OP, Tout>
        <<<x_blocks, x_block_size, 0, ctx.stream()>>>(
            x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu,
            x_strides_order_gpu, x_dims_order_gpu, x_data, y_data, out_data,
            dout_data, dx_data, out_size, max_dim, x_threads, dx_op);
  }
  if (dy) {
    auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes);
    int *y_strides_order_gpu =
        reinterpret_cast<int *>(y_strides_order_tmp->ptr());
    paddle::memory::Copy(gplace, y_strides_order_gpu, cplace,
                         y_strides_order.data(), bytes, ctx.stream());

    auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes);
    int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
    paddle::memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(),
                         bytes, ctx.stream());
    CommonGradBroadcastCUDAKernel<T, DY_OP, Tout>
        <<<y_blocks, y_block_size, 0, ctx.stream()>>>(
            x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu,
            y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data,
            dout_data, dy_data, out_size, max_dim, y_threads, dy_op);
  }
}

template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void CommonElementwiseBroadcastBackward(const GPUContext &ctx,
                                        const DDim &x_dims, const DDim &y_dims,
                                        const DenseTensor &x,
                                        const DenseTensor &y,
                                        const DenseTensor &out,
                                        const DenseTensor &dout, int axis,
                                        DenseTensor *dx, DenseTensor *dy,
                                        DX_OP dx_op, DY_OP dy_op) {
  int max_dim = std::max(x_dims.size(), y_dims.size());
  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                                y_dims_array.data(), out_dims_array.data(),
                                max_dim, axis);
  // for inplace strategy. memset will make dx and dout clear and get wrong
  // result.
  if (dx && dx->IsSharedBufferWith(dout)) {
    dx->clear();
    dx->mutable_data<T>(x_dims, ctx.GetPlace());
  }

  VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
          << paddle::framework::make_ddim(x_dims_array)
          << " ydim:" << paddle::framework::make_ddim(y_dims_array);

  CommonGradBroadcastCUDA<T, DX_OP, DY_OP, Tout>(
      x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
      out_dims_array.data(), max_dim, ctx, dx_op, dy_op);
}

template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx,
                                      const DDim &x_dims, const DDim &y_dims,
                                      const DenseTensor &x,
                                      const DenseTensor &y,
                                      const DenseTensor &out,
                                      const DenseTensor &dout, int axis,
                                      DenseTensor *dx, DenseTensor *dy,
                                      DX_OP dx_op, DY_OP dy_op) {
  bool is_xsize_larger = true;

  int max_dim = x_dims.size();
  if (x_dims.size() < y_dims.size()) {
    is_xsize_larger = false;
    max_dim = y_dims.size();
  }

  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  PADDLE_ENFORCE_GE(
      axis, 0,
      paddle::platform::errors::InvalidArgument(
          "Axis should be great than or equal to 0, but received axis is %d.",
          axis));
  PADDLE_ENFORCE_LT(
      axis, max_dim,
      paddle::platform::errors::InvalidArgument(
          "Axis should be less than %d, but received axis is %d.", max_dim,
          axis));

  int pre, n, post, is_run_common_broadcast, axis_trim = 0;
  if (is_xsize_larger) {
    auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
    axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
    funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post,
                        &is_run_common_broadcast);
  } else {
    auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
    axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
    funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post,
                        &is_run_common_broadcast);
  }
  // special case for common backward implementation.
  if (is_run_common_broadcast) {
    CommonElementwiseBroadcastBackward<T, DX_OP, DY_OP, Tout>(
        ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    return;
  }
  if (post == 1) {
    ElemwiseGradBroadcast1CUDA(
        ctx.stream(), x.data<T>(), y.data<T>(), out.data<Tout>(),
        dout.data<Tout>(), pre, n, is_xsize_larger, dx_op, dy_op,
        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
  } else {
    ElemwiseGradBroadcast2CUDA(
        ctx.stream(), x.data<T>(), y.data<T>(), out.data<Tout>(),
        dout.data<Tout>(), pre, n, post, is_xsize_larger, dx_op, dy_op,
        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
  }
}

}  // namespace pten
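
The kernel launches in this file pick their configuration from the same two patterns throughout: a 1-D launch of min(ELEMWISE_MAX_BLOCK_DIM, h) threads over w blocks for small problems, or BLOCK_X x BLOCK_Y tiles otherwise. A host-side sketch with made-up sizes, assuming the non-HIP value ELEMWISE_MAX_BLOCK_DIM = 1024 defined above (illustration only):

#include <algorithm>
#include <cstdio>

// Same constants the header defines for the CUDA (non-HIP) build.
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
constexpr int BLOCK_X = 32;
constexpr int BLOCK_Y = 32;

int main() {
  int h = 2048, w = 300;  // made-up "height broadcast" problem size

  if (w < 16 || h < 16) {
    // Small problems: one block per output column, threads reduce over h.
    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
    int grid_size = w;
    std::printf("1-D config: grid=%d block=%d\n", grid_size, block_size);
  } else {
    // Larger problems: 2-D tiles of BLOCK_X x BLOCK_Y, gridded over w.
    int block_x = BLOCK_X, block_y = BLOCK_Y;
    int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
    std::printf("2-D config: grid=%d block=(%d,%d)\n", grid_size, block_x,
                block_y);
  }
  return 0;
}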