PaddlePaddle / Paddle · Commit 1a0cd447
Unverified commit, authored on Aug 23, 2022 by niuliling123 and committed by GitHub on Aug 23, 2022.
Delete the template parameter BLockSize in Kernel Primitive API (#45220)
Parent: 3a7b1810
Showing 13 changed files with 136 additions and 295 deletions (+136, -295)
paddle/fluid/operators/dropout_impl.cu.h                   +18 -18
paddle/fluid/operators/fused/attn_bias_add.cu.h            +5  -6
paddle/phi/kernels/funcs/broadcast_function.h              +2  -2
paddle/phi/kernels/funcs/distribution_helper.h             +3  -4
paddle/phi/kernels/funcs/elementwise_base.h                +9  -9
paddle/phi/kernels/funcs/index_impl.cu.h                   +6  -6
paddle/phi/kernels/funcs/reduce_function.h                 +20 -35
paddle/phi/kernels/funcs/select_impl.cu.h                  +17 -17
paddle/phi/kernels/gpudnn/softmax_gpudnn.h                 +20 -24
paddle/phi/kernels/primitive/compute_primitives.h          +11 -63
paddle/phi/kernels/primitive/compute_primitives_xpu2.h     +7  -51
paddle/phi/kernels/primitive/datamover_primitives.h        +9  -30
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h   +9  -30
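The change is the same at every call site: the BlockSize template argument (always instantiated as 1) is dropped from each Kernel Primitive API (`kps`) instantiation, and each primitive's template declaration loses the corresponding `int` parameter. A minimal before/after sketch of the call-site pattern; the argument names are taken from the dropout diff below, so the snippet is illustrative rather than a verbatim quote of any one file:

    // Before #45220: BlockSize ("1") is the fourth template argument.
    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);

    // After #45220: only T, NX, NY and IsBoundary remain.
    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);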
paddle/fluid/operators/dropout_impl.cu.h

@@ -112,17 +112,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
   auto dst_functor =
       DstMaskFunctor<T, float>(1.0f - dropout_prob, is_upscale_in_train);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
+    kps::WriteData<T, kCount, 1, false>(dst + fix, &dst_mask[0], deal_size);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -130,17 +130,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
+    kps::WriteData<T, kCount, 1, true>(dst + fix, &dst_mask[0], remainder);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
         mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
@@ -233,17 +233,17 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   auto mask_functor = MaskFunctor<T, float>(1.0f - dropout_prob);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
         &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -251,16 +251,16 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
        &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
        &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
        &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
        mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
paddle/fluid/operators/fused/attn_bias_add.cu.h

@@ -73,24 +73,23 @@ __global__ void BroadcastKernelBinary(
   // load in0
   if (use_broadcast[0]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg0, in0, fix, configlists[0], numel);
   } else {
-    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
+    kernel_primitives::ReadData<InT, VecSize, 1>(arg0, in0 + fix, num);
   }
   // load in1
   if (use_broadcast[1]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg1, in1, fix, configlists[1], numel);
   } else {
-    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
+    kernel_primitives::ReadData<InT, VecSize, 1>(arg1, in1 + fix, num);
   }
   // compute
-  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
       result, arg0, arg1, func);
   // store
-  kernel_primitives::WriteData<OutT, VecSize, 1, 1, true>(out + fix, result, num);
+  kernel_primitives::WriteData<OutT, VecSize, 1, true>(out + fix, result, num);
 }
 // bias add forward impl for "[m, n] + [n] = [m, n]"
paddle/phi/kernels/funcs/broadcast_function.h

@@ -266,10 +266,10 @@ __device__ __forceinline__ void LoadData(
   // numel : whole num of output
   // num: how many data will be deal with in this time
   if (need_broadcast) {
-    kps::ReadDataBc<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadDataBc<T, VecSize, 1, IsBoundary>(
         dst, src, block_offset, config, numel, read_lens);
   } else {
-    kps::ReadData<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadData<T, VecSize, 1, IsBoundary>(
         dst, src + block_offset, num, read_lens);
   }
 }
paddle/phi/kernels/funcs/distribution_helper.h

@@ -278,11 +278,10 @@ __global__ void DistributionKernel(size_t size,
   MT args[kCount];
   T result[kCount];
   for (size_t i = idx; i < size; i += total_thread * kCount) {
-    kps::ElementwiseRandom<SType, MT, kCount, 1, DistOp>(&args[0], dist, &state);
-    kps::ElementwiseUnary<MT, T, kCount, 1, 1, TransformOp>(
+    kps::ElementwiseRandom<SType, MT, kCount, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<MT, T, kCount, 1, TransformOp>(
         &result[0], &args[0], trans);
-    kps::WriteData<T, T, kCount, 1, 1, true>(
+    kps::WriteData<T, T, kCount, 1, true>(
         out_data + i, &result[0], size - i, 1, stride, 1);
     __syncthreads();
   }
paddle/phi/kernels/funcs/elementwise_base.h

@@ -519,13 +519,13 @@ struct Loader {
     kps::Init<Type, ArgsT, Index, VecSize>(
         args, static_cast<Type>(1.0f), read_lens);
     if (is_boundary) {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, true>(
          args,
          reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
          num,
          read_lens);
     } else {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, false>(
          args,
          reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
          num,
@@ -595,7 +595,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity, true> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseAny<InT, OutT, VecSize, 1, 1, Arity, Functor>(
+    kps::ElementwiseAny<InT, OutT, VecSize, 1, Arity, Functor>(
        result, args, func);
   }
 };
@@ -606,7 +606,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 0, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseConstant<InT, OutT, VecSize, 1, 1, Functor>(result, func);
+    kps::ElementwiseConstant<InT, OutT, VecSize, 1, Functor>(result, func);
   }
 };
@@ -616,7 +616,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 1, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseUnary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], func);
   }
 };
@@ -627,7 +627,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 2, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], func, read_lens);
   }
 };
@@ -638,7 +638,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
                                 InT (*args)[VecSize],
                                 OutT* result,
                                 int read_lens) {
-    kps::ElementwiseTernary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseTernary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], args[2], func);
   }
 };
@@ -703,7 +703,7 @@ struct ElementwiseWriteDataCallerBc {
     }
 #pragma unroll
     for (int i = 0; i < NumOuts; ++i) {
-      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+      kps::WriteData<OutT, VecSize, 1, IsBoundary>(
          outs[i] + block_offset, dst[i], num, read_lens);
     }
   }
@@ -716,7 +716,7 @@ struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
                  kps::IndexType block_offset,
                  int num,
                  int read_lens) {
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(
        outs[0] + block_offset, src, num, read_lens);
   }
 };
paddle/phi/kernels/funcs/index_impl.cu.h

@@ -36,18 +36,18 @@ __global__ void VectorizedIndexKernel(T *out,
   size_t args[VecSize];
   T result[VecSize];
   for (; data_offset < main_offset; data_offset += stride) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, false>(
+    kps::WriteData<T, VecSize, 1, false>(
         out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
   }
   size_t num = numel - data_offset;
   if (num > 0) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
+    kps::WriteData<T, VecSize, 1, true>(out + data_offset, &result[0], num);
   }
 }
paddle/phi/kernels/funcs/reduce_function.h

@@ -712,7 +712,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                   1,
                   REDUCE_VEC_SIZE,
-                  1,
                   1,
                   Calculator,
                   kps::IdentityFunctor<Tx>,
                   false>(&input_reg[0],
@@ -725,12 +724,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
         stride,
         kps::IdentityFunctor<Tx>(),
         reduce_last_dim);
-    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, TransformOp>(
        &input_compute[0], &input_reg[0], transformer);
     kps::Reduce<MPType,
                 REDUCE_VEC_SIZE,
                 1,
-                1,
                 ReduceOp,
                 kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
@@ -742,7 +740,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                   1,
                   REDUCE_VEC_SIZE,
-                  1,
                   1,
                   Calculator,
                   TransformOp,
                   true>(&input_compute[0],
@@ -758,12 +755,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
     kps::Reduce<MPType,
                 REDUCE_VEC_SIZE,
                 1,
-                1,
                 ReduceOp,
                 kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::kGlobalMode>(
        &reduce_var, &reduce_var, reducer, reduce_last_dim);
     if (is_mean) {
       reduce_var = reduce_var / static_cast<MPType>(reduce_num);
@@ -807,27 +803,22 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
   MPType reduce_var = init;
   MPType reduce_compute = init;
   for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-    kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
-                                          input + loop_idx * left_num + idx,
-                                          block.BlockDimX(),
-                                          1,
-                                          1,
-                                          left_num);
-    kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+    kps::ReadData<Tx, Tx, 1, 1, false>(&reduce_input,
+                                       input + loop_idx * left_num + idx,
+                                       block.BlockDimX(),
+                                       1,
+                                       1,
+                                       left_num);
+    kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
        &reduce_compute, &reduce_input, transformer);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &reduce_compute, reducer, false);
   }
   if (is_mean) {
     reduce_var = reduce_var / static_cast<MPType>(mean_div);
   }
   Ty result = static_cast<Ty>(reduce_var);
-  kps::WriteData<Ty, 1, 1, 1, false>(
+  kps::WriteData<Ty, 1, 1, false>(
      y + store_offset + idx, &result, block.BlockDimX());
 }
@@ -835,20 +826,15 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
   MPType reduce_var = init;
   MPType reduce_compute = init;
   for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-    kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
-                                         input + loop_idx * left_num + idx,
-                                         dim.rem_x,
-                                         1,
-                                         1,
-                                         left_num);
-    kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+    kps::ReadData<Tx, Tx, 1, 1, true>(&reduce_input,
+                                      input + loop_idx * left_num + idx,
+                                      dim.rem_x,
+                                      1,
+                                      1,
+                                      left_num);
+    kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
        &reduce_compute, &reduce_input, transformer);
-    kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
+    kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &reduce_compute, reducer, false);
   }
@@ -856,8 +842,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
     reduce_var = reduce_var / static_cast<MPType>(mean_div);
   }
   Ty result = static_cast<Ty>(reduce_var);
-  kps::WriteData<Ty, 1, 1, 1, true>(
-      y + store_offset + idx, &result, dim.rem_x);
+  kps::WriteData<Ty, 1, 1, true>(y + store_offset + idx, &result, dim.rem_x);
  }
 }
paddle/phi/kernels/funcs/select_impl.cu.h

@@ -71,21 +71,21 @@ __device__ void GetBlockCountImpl(const InT *in,
   int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
   kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
-  kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
-  kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
+  kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
+  kps::ElementwiseUnary<InT, OutT, VecSize, 1, Cast>(
       &temp[0], &in_data[0], Cast());
-  kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<OutT, VecSize, 1, Add, Mode::kLocalMode>(
      &result, &temp[0], Add(), true);
-  kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
+  kps::Reduce<OutT, 1, 1, Add, Mode::kGlobalMode>(
      &result, &result, Add(), true);
   if (store_fix == 0) {
     // first block's fix_size = 0;
     OutT tmp = static_cast<OutT>(0.0f);
-    kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
+    kps::WriteData<OutT, 1, 1, true>(out + store_fix, &tmp, 1);
   }
   // store num of this block
-  kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
+  kps::WriteData<OutT, 1, 1, true>(out + store_fix + 1, &result, 1);
 }
 // Count how many data is not zero in current block
@@ -132,12 +132,12 @@ __device__ void CumsumImpl(
   // set pre_cumsum
   kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
   // load data to arg
-  kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
+  kps::ReadData<InT, InT, VecSize, 1, IsBoundary>(
      &arg[0], in, num, 1, BLOCK_NUM_X, 1);
   // block cumsum
-  kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
+  kps::Cumsum<InT, OutT, Functor>(&result[0], &arg[0], func);
   // result = cumsum_result + pre_cumsum
-  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
+  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, Functor>(
      &result[0], &result[0], &temp[0], func);
   // get the last prefix sum
   if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
@@ -146,7 +146,7 @@ __device__ void CumsumImpl(
   __syncthreads();
   // update pre_cumsum
   *pre_cumsum = max_thread_data;
-  kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
+  kps::WriteData<OutT, OutT, VecSize, 1, IsBoundary>(
      out, &result[0], num, 1, BLOCK_NUM_X, 1);
 }
@@ -189,7 +189,7 @@ struct SelectCaller {
     int64_t in_data[VecSize];
     OutT store_data[VecSize * phi::DDim::kMaxRank];
     // set index
-    kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
+    kps::InitWithDataIndex<int64_t, VecSize, 1>(&in_data[0], data_offset);
     // Get store data according to mask_idt
     kps::OperatorTernary<MT, int64_t, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -215,7 +215,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
                              int num) {
     InT in_data[VecSize];
     OutT store_data[VecSize * phi::DDim::kMaxRank];
-    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
+    kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
     // Get store data according to mask_idt
     kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -244,7 +244,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
     kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
     kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(out, &store_data[0], num);
   }
 };
@@ -285,16 +285,16 @@ __device__ void SelectKernelImpl(OutT *out,
   kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
   kps::Init<MT, VecSize>(&mask_data[0], init_mask);
   // Load mask
-  kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
+  kps::ReadData<MT, VecSize, 1, IsBoundary>(&mask_data[0], mask, num);
   // Cast from MT to int
-  kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
+  kps::ElementwiseUnary<MT, IdT, VecSize, 1, Cast>(
      &mask_idt[0], &mask_data[0], Cast());
   // Get the num of thread only num_thread[1] has data
-  kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<IdT, VecSize, 1, Add, Mode::kLocalMode>(
     &num_thread[0], &mask_idt[0], Add(), true);
   // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
   // thread_fix
-  kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
+  kps::Cumsum<IdT, IdT, Add>(&cumsum_thread[0], &num_thread[0], Add());
   // get thread_fix
   int thread_fix =
      (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
paddle/phi/kernels/gpudnn/softmax_gpudnn.h

@@ -311,9 +311,9 @@ __global__ void WarpSoftmaxForward(T* softmax,
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
-    kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
+    kps::ElementwiseUnary<T, AccT, kVItem, 1, DataTransFunctor<T, AccT>>(
        &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
   }
@@ -321,7 +321,6 @@ __global__ void WarpSoftmaxForward(T* softmax,
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               ReduceMaxFunctor<AccT>,
               kMode::kLocalMode>(
      &max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
@@ -330,15 +329,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
   // compute sum
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
        &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(
        &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
   }
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               kps::AddFunctor<AccT>,
               kMode::kLocalMode>(
      &sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -351,15 +349,15 @@ __global__ void WarpSoftmaxForward(T* softmax,
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnarySubFunctor<AccT>>(
          &out_tmp[i][0][0],
         &sub_data[i][0][0],
         UnarySubFunctor<AccT>(std::log(sum[i])));
     } else {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnaryDivFunctor<AccT>>(
         &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
     }
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -417,9 +415,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
     int ptr = (first_batch + i) * stride;
     const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
     const VecT* grad_v = reinterpret_cast<const VecT*>(&grad[ptr]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &src_reg[i][0], &src_v[0], idx_max_v[i], 0, kWarpSize, flag);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &grad_reg[i][0], &grad_v[0], idx_max_v[i], 0, kWarpSize, flag);
   }
@@ -430,9 +428,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
   const T* grad_ptr = reinterpret_cast<const T*>(&grad_reg[0][0]);
   constexpr int kStep = kBatchSize * kLoopsV * kVSize;
   constexpr int kVItem = kLoopsV * kVSize;
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
     &src_tmp[0][0][0], &src_ptr[0], DataTransFunctor<T, AccT>());
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
     &grad_tmp[0][0][0], &grad_ptr[0], DataTransFunctor<T, AccT>());
   // compute sum
@@ -444,17 +442,15 @@ __global__ void WarpSoftmaxBackward(T* dst,
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
        &sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
   } else {
-    kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+    kps::ElementwiseBinary<AccT, AccT, kStep, 1, kps::MulFunctor<AccT>>(
        &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
        &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -470,17 +466,17 @@ __global__ void WarpSoftmaxBackward(T* dst,
     AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
     AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpMulFunctor<AccT>>(
         &out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::SubFunctor<AccT>>(
        &out_tmp[i][0][0], &gradptr[0], &out[i][0][0], kps::SubFunctor<AccT>());
     } else {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
        &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::MulFunctor<AccT>>(
        &out_tmp[i][0][0], &srcptr[0], &out[i][0][0],
@@ -488,7 +484,7 @@ __global__ void WarpSoftmaxBackward(T* dst,
     }
     VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &dst_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -636,7 +632,7 @@ __global__ void NormalSoftmaxForward(
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
        &max_value, &max_value, kps::MaxFunctor<AccT>(), false);
   }
@@ -647,7 +643,7 @@ __global__ void NormalSoftmaxForward(
     sum += std::exp(value - max_value);
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
@@ -695,7 +691,7 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
     }
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
paddle/phi/kernels/primitive/compute_primitives.h

@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT, typename OutT>
@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
  * in: The register pointer of in, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseUnary(OutT* out,
                                                  const InT* in,
                                                  OpFunc compute) {
@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns computed by each thread.
  * NY: The number of data rows computed by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT>
@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(OutT* out,
                                                   const InT* in1,
                                                   const InT* in2,
@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
   }
 }

-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(
     OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
 #pragma unroll
@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
  * in3: The register pointer of third input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseTernary(
     OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Arity: The size of ins.
  * OpFunc: Compute functor which has an operator() as following:
@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
  * ins: A pointers of array consisting of multiple inputs.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Arity,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
 __device__ __forceinline__ void ElementwiseAny(OutT* out,
                                                InT (*ins)[NX * NY],
                                                OpFunc compute) {
@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT, typename OutT>
@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void CycleBinary(OutT* out,
                                             const InT* in1,
                                             const InT* in2,
@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
  * T: The type of data.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * ReduceFunctor: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           class ReduceFunctor,
           details::ReduceMode Mode>
 __device__ __forceinline__ void Reduce(T* out,
@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
  * out: The register pointer of out, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
  * hiprandStatePhilox4_32_10_t.
  * OutT: the type of out register.
  * ReturnsCount: The number of random data generated by OpFunc.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename T>
@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
  * compute: Compute function which was declared like OpFunc<T>().
  */
-template <typename StateType,
-          typename OutT,
-          int ReturnsCount,
-          int BlockSize,
-          class OpFunc>
+template <typename StateType, typename OutT, int ReturnsCount, class OpFunc>
 __device__ __forceinline__ void ElementwiseRandom(OutT* out,
                                                   OpFunc compute,
                                                   StateType* state) {
@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
  * @template paraments
  * InT: the type of input register.
  * OutT: the type of out register.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename T>
@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
  */
 #define SHARED_SIZE_LIMIT 512
-template <typename InT, typename OutT, int BlockSize, class OpFunc>
+template <typename InT, typename OutT, class OpFunc>
 __device__ __forceinline__ void Cumsum(OutT* out,
                                        const InT* in,
                                        OpFunc compute) {
@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
  * @template paraments
  * InT: the type of input register.
  * OutT: the type of out register.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  *
  * @param
@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
 #define SHARED_SIZE_LIMIT 1024
 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
 // larger than blockDim.x * 2
-template <typename InT, typename OutT, int BlockSize>
+template <typename InT, typename OutT>
 __device__ __forceinline__ void Sort(OutT* out,
                                      const InT* in,
                                      int num,
@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
  * InT: The type of input register.
  * OutT: The type of out register.
  * IndexType: The type of index.
- * BlockSize: Identifies the current device thread index method. Currently only
- * GPU was supported.
  *
  * @param
@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
  * monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
  * sorted in escending.
  */
-template <typename InT, typename OutT, typename IndexType, int BlockSize>
+template <typename InT, typename OutT, typename IndexType>
 __device__ __forceinline__ void Sort(OutT* out,
                                      IndexType* out_index,
                                      const InT* in,
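Seen from a kernel author's side, a compute primitive now takes exactly the template parameters it uses: the element types, the per-thread tile shape NX * NY, and the functor. A self-contained sketch of the post-change shape (a simplified stand-in written for illustration, not the verbatim kps implementation; ElementwiseUnarySketch and SquareFunctor are hypothetical names):

    // Simplified stand-in shaped like the post-#45220 primitive: no BlockSize.
    template <typename InT, typename OutT, int NX, int NY, class OpFunc>
    __device__ __forceinline__ void ElementwiseUnarySketch(OutT* out,
                                                           const InT* in,
                                                           OpFunc compute) {
    #pragma unroll
      for (int idx = 0; idx < NX * NY; idx++) {
        out[idx] = static_cast<OutT>(compute(in[idx]));  // apply functor per element
      }
    }

    struct SquareFunctor {
      __device__ float operator()(const float& x) const { return x * x; }
    };

    __global__ void SquareKernel(float* out, const float* in) {
      // Each thread owns a 4-element register tile.
      float reg_in[4];
      float reg_out[4];
      for (int i = 0; i < 4; ++i) reg_in[i] = in[threadIdx.x * 4 + i];
      ElementwiseUnarySketch<float, float, 4, 1>(reg_out, reg_in, SquareFunctor());
      for (int i = 0; i < 4; ++i) out[threadIdx.x * 4 + i] = reg_out[i];
    }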
paddle/phi/kernels/primitive/compute_primitives_xpu2.h

@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT, typename OutT>
@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
  * in: The register pointer of in, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseUnary(OutT* out,
                                                  const InT* in,
                                                  OpFunc compute) {
@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns computed by each thread.
  * NY: The number of data rows computed by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following:
  *   template <typename InT>
@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(OutT* out,
                                                   const InT* in1,
                                                   const InT* in2,
@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
   }
 }

-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseBinary(
     OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
   for (int idx = 0; idx < read_lens; ++idx) {
@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
  * in3: The register pointer of third input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseTernary(
     OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
 #pragma unroll
@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * Arity: The size of ins
  * OpFunc: Compute functor which has an operator() as following:
@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
  * ins: A pointers of array consisting of multiple inputs.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          int Arity,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
 __device__ __forceinline__ void ElementwiseAny(OutT* out,
                                                InT (*ins)[NX * NY],
                                                OpFunc compute) {
@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT, typename OutT>
@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
  * in2: The register pointer of second input, size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT, OutT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void CycleBinary(OutT* out,
                                             const InT* in1,
                                             const InT* in2,
@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
  * T: The type of data.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * ReduceFunctor: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           class ReduceFunctor,
           details::ReduceMode Mode>
 __device__ __forceinline__ void Reduce(T* out,
@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
  * OutT: The data type of out.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For xpu,
- * core_id() is used as the index.
  * OpFunc: Compute functor which has an operator() as following
  *   template <typename InT>
@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
  * out: The register pointer of out, the size is NX * NY.
  * compute: Compute function which was declared like OpFunc<InT>().
  */
-template <typename InT,
-          typename OutT,
-          int NX,
-          int NY,
-          int BlockSize,
-          class OpFunc>
+template <typename InT, typename OutT, int NX, int NY, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
paddle/phi/kernels/primitive/datamover_primitives.h

@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
  * Ty: The type of data that needs to be stored in registers.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename Tx,
-          typename Ty,
-          int NX,
-          int NY,
-          int BlockSize,
-          bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(Ty* dst,
                                          const Tx* __restrict__ src,
                                          int size_nx,
@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
  * T: The type of data.
  * NX: Each thread load NX data from global memory continuously.
  * NY: Each thread need to load NY rows, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
  * When the number of data processed by this block is less than
@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
  * src: The data pointer of the current block.
  * size: The current block needs to load size data continuously.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(T* dst,
                                          const T* __restrict__ src,
                                          int num) {
@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
   }
 }

-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(T* dst,
                                          const T* __restrict__ src,
                                          int num,
@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
  * NY: Each thread need to load NY rows, only NY = 1 was supported.
  * ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
  * Index: The index of data stored in dst.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
  * When the number of data processed by this block is less than
@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
 template <typename T,
           int NX,
           int NY,
-          int BlockSize,
           typename ArgsT,
           int Index,
           bool IsBoundary = false>
@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
  * T: The type of data stored in the global memory.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(T* dst,
                                            const T* __restrict__ src,
@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
  * T: The type of data.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -507,7 +496,6 @@ template <typename Tx,
           typename Ty,
           int NX,
           int NY,
-          int BlockSize,
           int Rank,
           typename IndexCal,
           typename Functor,
@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
  * T: The type of data.
  * NX: The number of data continuously writed by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
  * src: The register pointer, the size is NX * NY.
  * size: The current block needs to load size elements continuously.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(T* dst,
                                           T* __restrict__ src,
                                           int num) {
@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
   }
 }

-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(T* dst,
                                           T* __restrict__ src,
                                           int num,
@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
  * Ty: The type of data that stored in the global memory.
  * NX: The number of data columns loaded by each thread.
  * NY: The number of data rows loaded by each thread.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
  * judgment. When the number of data processed by the block is less than
@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
  * stride_nx: Each read one element stride stride_nx elements in the last dim.
  * stride_ny: Each read one element stride stride_ny elements in the first dim.
  */
-template <typename Tx,
-          typename Ty,
-          int NX,
-          int NY,
-          int BlockSize,
-          bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(Ty* dst,
                                           const Tx* __restrict__ src,
                                           int size_nx,
@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
  * T: The type of data stored in the global memory.
  * NX: The number of data continuously loaded by each thread.
  * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
  * IsBoundary: Indicates whether to perform block access storage out-of-bounds
@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
  * coordinate mapping relationship between output data and input data.
  * total_num_output: Total number of original output.
  */
-template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+template <typename T, int NX, int NY, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataBc(T* dst,
                                            const T* __restrict__ src,
@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
  * T: Data type of register.
  * NX: Number of data to initialize.
  * NY: Number of data to initialize, NY only can be 1.
- * BlockSize: Identifies the current device thread index method. For GPU,
- * threadIdx.x is used as the thread index. Currently only GPU was supported.
  *
  * @param:
  * dst: The register pointer of the thread, the size is NX.
  * init_data: The register pointer of init data, the size is NX.
  */
-template <typename T, int NX, int NY, int BlockSize>
+template <typename T, int NX, int NY>
 __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
   int thread_offset = block_offset + threadIdx.x * NX;
 #pragma unroll
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
1a0cd447
...
...
@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst,
                                    const Tx _global_ptr_* src,
                                    int size_nx,
...
...
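Ignoring the _global_ptr_ address space, core offsets, and the IsBoundary clipping, the strided ReadData walks an NY-by-NX window and converts Tx to Ty element by element. A hedged host model of just that access pattern:

// Hedged model: NY rows of NX columns, stepping stride_nx in the last dim
// and stride_ny in the first, with a Tx -> Ty cast into registers.
template <typename Tx, typename Ty, int NX, int NY>
void StridedReadModel(Ty* dst, const Tx* src, int stride_nx, int stride_ny) {
  for (int idy = 0; idy < NY; ++idy) {
    for (int idx = 0; idx < NX; ++idx) {
      dst[idy * NX + idx] =
          static_cast<Ty>(src[idy * stride_ny + idx * stride_nx]);
    }
  }
}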
@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num) {
...
...
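The IsBoundary flag is what separates full blocks from the tail: with it set, a lane stops copying once its offset reaches num. A hedged model of that branch, with the core offset passed in directly:

// Hedged model of the boundary check in the continuous ReadData.
template <typename T, int NX, bool IsBoundary>
void ContinuousReadModel(T* dst, const T* src, int thread_offset, int num) {
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && thread_offset + idx >= num) {
      break;  // tail block: fewer than NX elements remain for this core
    }
    dst[idx] = src[thread_offset + idx];
  }
}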
@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num,
...
...
@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
template <typename T,
          int NX,
          int NY,
          int BlockSize,
          typename ArgsT,
          int Index,
          bool IsBoundary>
...
...
@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
...
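The call-site impact of every hunk in this file is the same mechanical one: a dummy template argument disappears. A compilable stand-in pair showing the before/after layout (names are invented; the trailing runtime parameters of the real ReadDataBc are elided above, so they are elided here too):

// Hedged illustration with stand-in primitives, not the real kps API.
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
void OldStyle(T*, const T*, unsigned) {}  // BlockSize was never used

template <typename T, int NX, int NY, bool IsBoundary>
void NewStyle(T*, const T*, unsigned) {}

void CallSites(float* dst, const float* src) {
  OldStyle<float, 4, 1, 1, true>(dst, src, 0);  // old: placeholder "1"
  NewStyle<float, 4, 1, true>(dst, src, 0);     // new: argument dropped
}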
@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -662,7 +651,6 @@ template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          int Rank,
          typename IndexCal,
          typename Functor,
...
...
@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
* T: The type of data.
* NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst,
                          const T* src,
                          int num,
...
...
@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  int thread_offset = core_id() * NX;
  mfence_local();
...
...
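On XPU the lane offset comes from core_id() rather than threadIdx.x, and mfence_local() fences local memory before the stores. A hedged host model of the write loop that follows, with the core index made explicit:

// Hedged model of the continuous XPU WriteData shown above.
template <typename T, int NX, bool IsBoundary>
void WriteDataModel(T* dst, const T* src, int num, int core) {
  int thread_offset = core * NX;  // core_id() * NX in the real body
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && thread_offset + idx >= num) {
      break;
    }
    dst[thread_offset + idx] = src[idx];
  }
}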
@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
                                     const Tx* src,
                                     int size_nx,
...
...
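As with the strided read, the Tx/Ty overload's job is the per-element cast from register type to global-memory type. A hedged model with strides and boundary checks stripped out:

// Hedged model of the converting write: Tx registers cast to Ty on the way
// to global memory (e.g. a float accumulator written back as half).
template <typename Tx, typename Ty, int NX>
void ConvertingWriteModel(Ty* dst, const Tx* src) {
  for (int idx = 0; idx < NX; ++idx) {
    dst[idx] = static_cast<Ty>(src[idx]);
  }
}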
@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
...
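read_lens and total_num_output together bound this broadcast read: each lane loads up to read_lens elements and, under IsBoundary, stops at the last real output. A hedged model with an identity map standing in for the broadcast config:

// Hedged model of the tail clipping in the XPU broadcast read.
template <typename T, bool IsBoundary>
void ReadDataBcTailModel(T* dst, const T* src, int thread_offset,
                         int read_lens, int total_num_output) {
  for (int idx = 0; idx < read_lens; ++idx) {
    int index_output = thread_offset + idx;
    if (IsBoundary && index_output >= total_num_output) {
      break;  // fewer than read_lens outputs remain
    }
    // The real primitive maps index_output through a broadcast config;
    // an identity map stands in for it here.
    dst[idx] = src[index_output];
  }
}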
@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
  int thread_offset = block_offset + core_id() * NX;
#pragma unroll
...
...
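The XPU variant's arithmetic is identical to the CUDA one above except that the lane index comes from core_id(). A short check of the offset formula, modeled with plain ints:

// Hedged check of thread_offset = block_offset + core_id() * NX.
#include <cassert>

int FirstIndex(int block_offset, int core, int nx) {
  return block_offset + core * nx;
}

int main() {
  assert(FirstIndex(/*block_offset=*/64, /*core=*/3, /*nx=*/8) == 88);
  return 0;
}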