Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
1a0cd447
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1a0cd447
编写于
8月 23, 2022
作者:
N
niuliling123
提交者:
GitHub
8月 23, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Delete the template parameter BLockSize in Kernel Primitive API (#45220)
上级
3a7b1810
变更
13
显示空白变更内容
内联
并排
Showing
13 changed file
with
136 addition
and
295 deletion
+136
-295
paddle/fluid/operators/dropout_impl.cu.h
paddle/fluid/operators/dropout_impl.cu.h
+18
-18
paddle/fluid/operators/fused/attn_bias_add.cu.h
paddle/fluid/operators/fused/attn_bias_add.cu.h
+5
-6
paddle/phi/kernels/funcs/broadcast_function.h
paddle/phi/kernels/funcs/broadcast_function.h
+2
-2
paddle/phi/kernels/funcs/distribution_helper.h
paddle/phi/kernels/funcs/distribution_helper.h
+3
-4
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_base.h
+9
-9
paddle/phi/kernels/funcs/index_impl.cu.h
paddle/phi/kernels/funcs/index_impl.cu.h
+6
-6
paddle/phi/kernels/funcs/reduce_function.h
paddle/phi/kernels/funcs/reduce_function.h
+20
-35
paddle/phi/kernels/funcs/select_impl.cu.h
paddle/phi/kernels/funcs/select_impl.cu.h
+17
-17
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+20
-24
paddle/phi/kernels/primitive/compute_primitives.h
paddle/phi/kernels/primitive/compute_primitives.h
+11
-63
paddle/phi/kernels/primitive/compute_primitives_xpu2.h
paddle/phi/kernels/primitive/compute_primitives_xpu2.h
+7
-51
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/datamover_primitives.h
+9
-30
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+9
-30
未找到文件。
paddle/fluid/operators/dropout_impl.cu.h
浏览文件 @
1a0cd447
...
...
@@ -112,17 +112,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
auto
dst_functor
=
DstMaskFunctor
<
T
,
float
>
(
1.0
f
-
dropout_prob
,
is_upscale_in_train
);
for
(;
fix
<
main_offset
;
fix
+=
stride
)
{
kps
::
ReadData
<
T
,
kCount
,
1
,
1
,
false
>
(
&
dst_mask
[
0
],
src
+
fix
,
deal_size
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
1
,
Rand
>
(
kps
::
ReadData
<
T
,
kCount
,
1
,
false
>
(
&
dst_mask
[
0
],
src
+
fix
,
deal_size
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
Rand
>
(
&
rands
[
0
],
Rand
(),
&
state
);
// dst
kps
::
OperatorTernary
<
T
,
float
,
T
,
DstMaskFunctor
<
T
,
float
>>
(
&
dst_mask
[
0
],
&
dst_mask
[
0
],
&
rands
[
0
],
dst_functor
,
kCount
);
kps
::
WriteData
<
T
,
kCount
,
1
,
1
,
false
>
(
dst
+
fix
,
&
dst_mask
[
0
],
deal_size
);
kps
::
WriteData
<
T
,
kCount
,
1
,
false
>
(
dst
+
fix
,
&
dst_mask
[
0
],
deal_size
);
// mask
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
1
,
Cast
>
(
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
Cast
>
(
&
mask_result
[
0
],
&
dst_mask
[
kCount
],
Cast
());
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
1
,
false
>
(
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
false
>
(
mask
+
fix
,
&
mask_result
[
0
],
deal_size
);
if
(
fix
>
idx
*
kCount
+
1
)
{
__syncthreads
();
...
...
@@ -130,17 +130,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
}
int
remainder
=
n
-
fix
;
if
(
remainder
>
0
)
{
kps
::
ReadData
<
T
,
kCount
,
1
,
1
,
true
>
(
&
dst_mask
[
0
],
src
+
fix
,
remainder
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
1
,
Rand
>
(
kps
::
ReadData
<
T
,
kCount
,
1
,
true
>
(
&
dst_mask
[
0
],
src
+
fix
,
remainder
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
Rand
>
(
&
rands
[
0
],
Rand
(),
&
state
);
// dst
kps
::
OperatorTernary
<
T
,
float
,
T
,
DstMaskFunctor
<
T
,
float
>>
(
&
dst_mask
[
0
],
&
dst_mask
[
0
],
&
rands
[
0
],
dst_functor
,
kCount
);
kps
::
WriteData
<
T
,
kCount
,
1
,
1
,
true
>
(
dst
+
fix
,
&
dst_mask
[
0
],
remainder
);
kps
::
WriteData
<
T
,
kCount
,
1
,
true
>
(
dst
+
fix
,
&
dst_mask
[
0
],
remainder
);
// mask
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
1
,
Cast
>
(
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
Cast
>
(
&
mask_result
[
0
],
&
dst_mask
[
kCount
],
Cast
());
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
1
,
true
>
(
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
true
>
(
mask
+
fix
,
&
mask_result
[
0
],
remainder
);
__syncthreads
();
}
...
...
@@ -233,17 +233,17 @@ __global__ void VectorizedGeneratorMask(const size_t n,
auto
mask_functor
=
MaskFunctor
<
T
,
float
>
(
1.0
f
-
dropout_prob
);
for
(;
fix
<
main_offset
;
fix
+=
stride
)
{
kps
::
ReadData
<
T
,
kCount
,
1
,
1
,
false
>
(
&
dst_mask
[
0
],
src
+
fix
,
deal_size
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
1
,
Rand
>
(
kps
::
ReadData
<
T
,
kCount
,
1
,
false
>
(
&
dst_mask
[
0
],
src
+
fix
,
deal_size
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
Rand
>
(
&
rands
[
0
],
Rand
(),
&
state
);
// dst
kps
::
OperatorBinary
<
float
,
T
,
MaskFunctor
<
T
,
float
>>
(
&
dst_mask
[
0
],
&
rands
[
0
],
mask_functor
,
kCount
);
// mask
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
1
,
Cast
>
(
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
Cast
>
(
&
mask_result
[
0
],
&
dst_mask
[
0
],
Cast
());
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
1
,
false
>
(
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
false
>
(
mask
+
fix
,
&
mask_result
[
0
],
deal_size
);
if
(
fix
>
idx
*
kCount
+
1
)
{
__syncthreads
();
...
...
@@ -251,16 +251,16 @@ __global__ void VectorizedGeneratorMask(const size_t n,
}
int
remainder
=
n
-
fix
;
if
(
remainder
>
0
)
{
kps
::
ReadData
<
T
,
kCount
,
1
,
1
,
true
>
(
&
dst_mask
[
0
],
src
+
fix
,
remainder
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
1
,
Rand
>
(
kps
::
ReadData
<
T
,
kCount
,
1
,
true
>
(
&
dst_mask
[
0
],
src
+
fix
,
remainder
);
kps
::
ElementwiseRandom
<
SType
,
float
,
kCount
,
Rand
>
(
&
rands
[
0
],
Rand
(),
&
state
);
// dst
kps
::
OperatorBinary
<
float
,
T
,
MaskFunctor
<
T
,
float
>>
(
&
dst_mask
[
0
],
&
rands
[
0
],
mask_functor
,
kCount
);
// mask
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
1
,
Cast
>
(
kps
::
ElementwiseUnary
<
T
,
MaskType
,
kCount
,
1
,
Cast
>
(
&
mask_result
[
0
],
&
dst_mask
[
0
],
Cast
());
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
1
,
true
>
(
kps
::
WriteData
<
MaskType
,
kCount
,
1
,
true
>
(
mask
+
fix
,
&
mask_result
[
0
],
remainder
);
__syncthreads
();
}
...
...
paddle/fluid/operators/fused/attn_bias_add.cu.h
浏览文件 @
1a0cd447
...
...
@@ -73,24 +73,23 @@ __global__ void BroadcastKernelBinary(
// load in0
if
(
use_broadcast
[
0
])
{
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
>
(
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
>
(
arg0
,
in0
,
fix
,
configlists
[
0
],
numel
);
}
else
{
kernel_primitives
::
ReadData
<
InT
,
VecSize
,
1
,
1
>
(
arg0
,
in0
+
fix
,
num
);
}
// load in1
if
(
use_broadcast
[
1
])
{
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
>
(
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
>
(
arg1
,
in1
,
fix
,
configlists
[
1
],
numel
);
}
else
{
kernel_primitives
::
ReadData
<
InT
,
VecSize
,
1
,
1
>
(
arg1
,
in1
+
fix
,
num
);
kernel_primitives
::
ReadData
<
InT
,
VecSize
,
1
>
(
arg1
,
in1
+
fix
,
num
);
}
// compute
kernel_primitives
::
ElementwiseBinary
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
kernel_primitives
::
ElementwiseBinary
<
InT
,
OutT
,
VecSize
,
1
,
Functor
>
(
result
,
arg0
,
arg1
,
func
);
// store
kernel_primitives
::
WriteData
<
OutT
,
VecSize
,
1
,
1
,
true
>
(
out
+
fix
,
result
,
num
);
kernel_primitives
::
WriteData
<
OutT
,
VecSize
,
1
,
true
>
(
out
+
fix
,
result
,
num
);
}
// bias add forward impl for "[m, n] + [n] = [m, n]"
...
...
paddle/phi/kernels/funcs/broadcast_function.h
浏览文件 @
1a0cd447
...
...
@@ -266,10 +266,10 @@ __device__ __forceinline__ void LoadData(
// numel : whole num of output
// num: how many data will be deal with in this time
if
(
need_broadcast
)
{
kps
::
ReadDataBc
<
T
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
ReadDataBc
<
T
,
VecSize
,
1
,
IsBoundary
>
(
dst
,
src
,
block_offset
,
config
,
numel
,
read_lens
);
}
else
{
kps
::
ReadData
<
T
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
ReadData
<
T
,
VecSize
,
1
,
IsBoundary
>
(
dst
,
src
+
block_offset
,
num
,
read_lens
);
}
}
...
...
paddle/phi/kernels/funcs/distribution_helper.h
浏览文件 @
1a0cd447
...
...
@@ -278,11 +278,10 @@ __global__ void DistributionKernel(size_t size,
MT
args
[
kCount
];
T
result
[
kCount
];
for
(
size_t
i
=
idx
;
i
<
size
;
i
+=
total_thread
*
kCount
)
{
kps
::
ElementwiseRandom
<
SType
,
MT
,
kCount
,
1
,
DistOp
>
(
&
args
[
0
],
dist
,
&
state
);
kps
::
ElementwiseUnary
<
MT
,
T
,
kCount
,
1
,
1
,
TransformOp
>
(
kps
::
ElementwiseRandom
<
SType
,
MT
,
kCount
,
DistOp
>
(
&
args
[
0
],
dist
,
&
state
);
kps
::
ElementwiseUnary
<
MT
,
T
,
kCount
,
1
,
TransformOp
>
(
&
result
[
0
],
&
args
[
0
],
trans
);
kps
::
WriteData
<
T
,
T
,
kCount
,
1
,
1
,
true
>
(
kps
::
WriteData
<
T
,
T
,
kCount
,
1
,
true
>
(
out_data
+
i
,
&
result
[
0
],
size
-
i
,
1
,
stride
,
1
);
__syncthreads
();
}
...
...
paddle/phi/kernels/funcs/elementwise_base.h
浏览文件 @
1a0cd447
...
...
@@ -519,13 +519,13 @@ struct Loader {
kps
::
Init
<
Type
,
ArgsT
,
Index
,
VecSize
>
(
args
,
static_cast
<
Type
>
(
1.0
f
),
read_lens
);
if
(
is_boundary
)
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
true
>
(
kps
::
ReadData
<
Type
,
VecSize
,
1
,
ArgsT
,
Index
,
true
>
(
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
offset
,
num
,
read_lens
);
}
else
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
false
>
(
kps
::
ReadData
<
Type
,
VecSize
,
1
,
ArgsT
,
Index
,
false
>
(
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
offset
,
num
,
...
...
@@ -595,7 +595,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity, true> {
InT
(
*
args
)[
VecSize
],
OutT
*
result
,
int
read_lens
)
{
kps
::
ElementwiseAny
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Arity
,
Functor
>
(
kps
::
ElementwiseAny
<
InT
,
OutT
,
VecSize
,
1
,
Arity
,
Functor
>
(
result
,
args
,
func
);
}
};
...
...
@@ -606,7 +606,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 0, false> {
InT
(
*
args
)[
VecSize
],
OutT
*
result
,
int
read_lens
)
{
kps
::
ElementwiseConstant
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
result
,
func
);
kps
::
ElementwiseConstant
<
InT
,
OutT
,
VecSize
,
1
,
Functor
>
(
result
,
func
);
}
};
...
...
@@ -616,7 +616,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 1, false> {
InT
(
*
args
)[
VecSize
],
OutT
*
result
,
int
read_lens
)
{
kps
::
ElementwiseUnary
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
ElementwiseUnary
<
InT
,
OutT
,
VecSize
,
1
,
Functor
>
(
result
,
args
[
0
],
func
);
}
};
...
...
@@ -627,7 +627,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 2, false> {
InT
(
*
args
)[
VecSize
],
OutT
*
result
,
int
read_lens
)
{
kps
::
ElementwiseBinary
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
ElementwiseBinary
<
InT
,
OutT
,
VecSize
,
1
,
Functor
>
(
result
,
args
[
0
],
args
[
1
],
func
,
read_lens
);
}
};
...
...
@@ -638,7 +638,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
InT
(
*
args
)[
VecSize
],
OutT
*
result
,
int
read_lens
)
{
kps
::
ElementwiseTernary
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
ElementwiseTernary
<
InT
,
OutT
,
VecSize
,
1
,
Functor
>
(
result
,
args
[
0
],
args
[
1
],
args
[
2
],
func
);
}
};
...
...
@@ -703,7 +703,7 @@ struct ElementwiseWriteDataCallerBc {
}
#pragma unroll
for
(
int
i
=
0
;
i
<
NumOuts
;
++
i
)
{
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
IsBoundary
>
(
outs
[
i
]
+
block_offset
,
dst
[
i
],
num
,
read_lens
);
}
}
...
...
@@ -716,7 +716,7 @@ struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
kps
::
IndexType
block_offset
,
int
num
,
int
read_lens
)
{
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
IsBoundary
>
(
outs
[
0
]
+
block_offset
,
src
,
num
,
read_lens
);
}
};
...
...
paddle/phi/kernels/funcs/index_impl.cu.h
浏览文件 @
1a0cd447
...
...
@@ -36,18 +36,18 @@ __global__ void VectorizedIndexKernel(T *out,
size_t
args
[
VecSize
];
T
result
[
VecSize
];
for
(;
data_offset
<
main_offset
;
data_offset
+=
stride
)
{
kps
::
InitWithDataIndex
<
size_t
,
VecSize
,
1
,
1
>
(
&
args
[
0
],
data_offset
);
kps
::
ElementwiseUnary
<
size_t
,
T
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
InitWithDataIndex
<
size_t
,
VecSize
,
1
>
(
&
args
[
0
],
data_offset
);
kps
::
ElementwiseUnary
<
size_t
,
T
,
VecSize
,
1
,
Functor
>
(
&
result
[
0
],
&
args
[
0
],
func
);
kps
::
WriteData
<
T
,
VecSize
,
1
,
1
,
false
>
(
kps
::
WriteData
<
T
,
VecSize
,
1
,
false
>
(
out
+
data_offset
,
&
result
[
0
],
BLOCK_NUM_X
*
VecSize
);
}
size_t
num
=
numel
-
data_offset
;
if
(
num
>
0
)
{
kps
::
InitWithDataIndex
<
size_t
,
VecSize
,
1
,
1
>
(
&
args
[
0
],
data_offset
);
kps
::
ElementwiseUnary
<
size_t
,
T
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
InitWithDataIndex
<
size_t
,
VecSize
,
1
>
(
&
args
[
0
],
data_offset
);
kps
::
ElementwiseUnary
<
size_t
,
T
,
VecSize
,
1
,
Functor
>
(
&
result
[
0
],
&
args
[
0
],
func
);
kps
::
WriteData
<
T
,
VecSize
,
1
,
1
,
true
>
(
out
+
data_offset
,
&
result
[
0
],
num
);
kps
::
WriteData
<
T
,
VecSize
,
1
,
true
>
(
out
+
data_offset
,
&
result
[
0
],
num
);
}
}
...
...
paddle/phi/kernels/funcs/reduce_function.h
浏览文件 @
1a0cd447
...
...
@@ -712,7 +712,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
1
,
REDUCE_VEC_SIZE
,
1
,
1
,
Calculator
,
kps
::
IdentityFunctor
<
Tx
>
,
false
>
(
&
input_reg
[
0
],
...
...
@@ -725,12 +724,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
stride
,
kps
::
IdentityFunctor
<
Tx
>
(),
reduce_last_dim
);
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
REDUCE_VEC_SIZE
,
1
,
1
,
TransformOp
>
(
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
REDUCE_VEC_SIZE
,
1
,
TransformOp
>
(
&
input_compute
[
0
],
&
input_reg
[
0
],
transformer
);
kps
::
Reduce
<
MPType
,
REDUCE_VEC_SIZE
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
reduce_var
,
&
input_compute
[
0
],
reducer
,
reduce_last_dim
);
...
...
@@ -742,7 +740,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
1
,
REDUCE_VEC_SIZE
,
1
,
1
,
Calculator
,
TransformOp
,
true
>
(
&
input_compute
[
0
],
...
...
@@ -758,12 +755,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
kps
::
Reduce
<
MPType
,
REDUCE_VEC_SIZE
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
reduce_var
,
&
input_compute
[
0
],
reducer
,
reduce_last_dim
);
kps
::
Reduce
<
MPType
,
1
,
1
,
1
,
ReduceOp
,
kps
::
details
::
kGlobalMode
>
(
kps
::
Reduce
<
MPType
,
1
,
1
,
ReduceOp
,
kps
::
details
::
kGlobalMode
>
(
&
reduce_var
,
&
reduce_var
,
reducer
,
reduce_last_dim
);
if
(
is_mean
)
{
reduce_var
=
reduce_var
/
static_cast
<
MPType
>
(
reduce_num
);
...
...
@@ -807,27 +803,22 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
MPType
reduce_var
=
init
;
MPType
reduce_compute
=
init
;
for
(
int
loop_idx
=
0
;
loop_idx
<
loop_size
;
++
loop_idx
)
{
kps
::
ReadData
<
Tx
,
Tx
,
1
,
1
,
1
,
false
>
(
&
reduce_input
,
kps
::
ReadData
<
Tx
,
Tx
,
1
,
1
,
false
>
(
&
reduce_input
,
input
+
loop_idx
*
left_num
+
idx
,
block
.
BlockDimX
(),
1
,
1
,
left_num
);
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
1
,
1
,
1
,
TransformOp
>
(
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
1
,
1
,
TransformOp
>
(
&
reduce_compute
,
&
reduce_input
,
transformer
);
kps
::
Reduce
<
MPType
,
1
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
kps
::
Reduce
<
MPType
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
reduce_var
,
&
reduce_compute
,
reducer
,
false
);
}
if
(
is_mean
)
{
reduce_var
=
reduce_var
/
static_cast
<
MPType
>
(
mean_div
);
}
Ty
result
=
static_cast
<
Ty
>
(
reduce_var
);
kps
::
WriteData
<
Ty
,
1
,
1
,
1
,
false
>
(
kps
::
WriteData
<
Ty
,
1
,
1
,
false
>
(
y
+
store_offset
+
idx
,
&
result
,
block
.
BlockDimX
());
}
...
...
@@ -835,20 +826,15 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
MPType
reduce_var
=
init
;
MPType
reduce_compute
=
init
;
for
(
int
loop_idx
=
0
;
loop_idx
<
loop_size
;
++
loop_idx
)
{
kps
::
ReadData
<
Tx
,
Tx
,
1
,
1
,
1
,
true
>
(
&
reduce_input
,
kps
::
ReadData
<
Tx
,
Tx
,
1
,
1
,
true
>
(
&
reduce_input
,
input
+
loop_idx
*
left_num
+
idx
,
dim
.
rem_x
,
1
,
1
,
left_num
);
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
1
,
1
,
1
,
TransformOp
>
(
kps
::
ElementwiseUnary
<
Tx
,
MPType
,
1
,
1
,
TransformOp
>
(
&
reduce_compute
,
&
reduce_input
,
transformer
);
kps
::
Reduce
<
MPType
,
1
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
kps
::
Reduce
<
MPType
,
1
,
1
,
ReduceOp
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
reduce_var
,
&
reduce_compute
,
reducer
,
false
);
}
...
...
@@ -856,8 +842,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
reduce_var
=
reduce_var
/
static_cast
<
MPType
>
(
mean_div
);
}
Ty
result
=
static_cast
<
Ty
>
(
reduce_var
);
kps
::
WriteData
<
Ty
,
1
,
1
,
1
,
true
>
(
y
+
store_offset
+
idx
,
&
result
,
dim
.
rem_x
);
kps
::
WriteData
<
Ty
,
1
,
1
,
true
>
(
y
+
store_offset
+
idx
,
&
result
,
dim
.
rem_x
);
}
}
...
...
paddle/phi/kernels/funcs/select_impl.cu.h
浏览文件 @
1a0cd447
...
...
@@ -71,21 +71,21 @@ __device__ void GetBlockCountImpl(const InT *in,
int
store_fix
=
BLOCK_ID_X
+
repeat
*
GRID_NUM_X
;
kps
::
Init
<
InT
,
VecSize
>
(
&
in_data
[
0
],
static_cast
<
InT
>
(
0.0
f
));
kps
::
ReadData
<
InT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
&
in_data
[
0
],
in
,
num
);
kps
::
ElementwiseUnary
<
InT
,
OutT
,
VecSize
,
1
,
1
,
Cast
>
(
kps
::
ReadData
<
InT
,
VecSize
,
1
,
IsBoundary
>
(
&
in_data
[
0
],
in
,
num
);
kps
::
ElementwiseUnary
<
InT
,
OutT
,
VecSize
,
1
,
Cast
>
(
&
temp
[
0
],
&
in_data
[
0
],
Cast
());
kps
::
Reduce
<
OutT
,
VecSize
,
1
,
1
,
Add
,
Mode
::
kLocalMode
>
(
kps
::
Reduce
<
OutT
,
VecSize
,
1
,
Add
,
Mode
::
kLocalMode
>
(
&
result
,
&
temp
[
0
],
Add
(),
true
);
kps
::
Reduce
<
OutT
,
1
,
1
,
1
,
Add
,
Mode
::
kGlobalMode
>
(
kps
::
Reduce
<
OutT
,
1
,
1
,
Add
,
Mode
::
kGlobalMode
>
(
&
result
,
&
result
,
Add
(),
true
);
if
(
store_fix
==
0
)
{
// first block's fix_size = 0;
OutT
tmp
=
static_cast
<
OutT
>
(
0.0
f
);
kps
::
WriteData
<
OutT
,
1
,
1
,
1
,
true
>
(
out
+
store_fix
,
&
tmp
,
1
);
kps
::
WriteData
<
OutT
,
1
,
1
,
true
>
(
out
+
store_fix
,
&
tmp
,
1
);
}
// store num of this block
kps
::
WriteData
<
OutT
,
1
,
1
,
1
,
true
>
(
out
+
store_fix
+
1
,
&
result
,
1
);
kps
::
WriteData
<
OutT
,
1
,
1
,
true
>
(
out
+
store_fix
+
1
,
&
result
,
1
);
}
// Count how many data is not zero in current block
...
...
@@ -132,12 +132,12 @@ __device__ void CumsumImpl(
// set pre_cumsum
kps
::
Init
<
OutT
,
VecSize
>
(
&
temp
[
0
],
*
pre_cumsum
);
// load data to arg
kps
::
ReadData
<
InT
,
InT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
ReadData
<
InT
,
InT
,
VecSize
,
1
,
IsBoundary
>
(
&
arg
[
0
],
in
,
num
,
1
,
BLOCK_NUM_X
,
1
);
// block cumsum
kps
::
Cumsum
<
InT
,
OutT
,
1
,
Functor
>
(
&
result
[
0
],
&
arg
[
0
],
func
);
kps
::
Cumsum
<
InT
,
OutT
,
Functor
>
(
&
result
[
0
],
&
arg
[
0
],
func
);
// result = cumsum_result + pre_cumsum
kps
::
ElementwiseBinary
<
OutT
,
OutT
,
VecSize
,
1
,
1
,
Functor
>
(
kps
::
ElementwiseBinary
<
OutT
,
OutT
,
VecSize
,
1
,
Functor
>
(
&
result
[
0
],
&
result
[
0
],
&
temp
[
0
],
func
);
// get the last prefix sum
if
((
THREAD_ID_X
==
BLOCK_NUM_X
-
1
)
&&
!
IsBoundary
)
{
...
...
@@ -146,7 +146,7 @@ __device__ void CumsumImpl(
__syncthreads
();
// update pre_cumsum
*
pre_cumsum
=
max_thread_data
;
kps
::
WriteData
<
OutT
,
OutT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
kps
::
WriteData
<
OutT
,
OutT
,
VecSize
,
1
,
IsBoundary
>
(
out
,
&
result
[
0
],
num
,
1
,
BLOCK_NUM_X
,
1
);
}
...
...
@@ -189,7 +189,7 @@ struct SelectCaller {
int64_t
in_data
[
VecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
// set index
kps
::
InitWithDataIndex
<
int64_t
,
VecSize
,
1
,
1
>
(
&
in_data
[
0
],
data_offset
);
kps
::
InitWithDataIndex
<
int64_t
,
VecSize
,
1
>
(
&
in_data
[
0
],
data_offset
);
// Get store data according to mask_idt
kps
::
OperatorTernary
<
MT
,
int64_t
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
...
...
@@ -215,7 +215,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
int
num
)
{
InT
in_data
[
VecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
kps
::
ReadData
<
InT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
&
in_data
[
0
],
in
,
num
);
kps
::
ReadData
<
InT
,
VecSize
,
1
,
IsBoundary
>
(
&
in_data
[
0
],
in
,
num
);
// Get store data according to mask_idt
kps
::
OperatorTernary
<
MT
,
InT
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
...
...
@@ -244,7 +244,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
kps
::
details
::
ReadData
<
InT
>
(
&
in_data
[
0
],
in
+
thread_fix
,
store_num
);
kps
::
OperatorTernary
<
MT
,
InT
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
out
,
&
store_data
[
0
],
num
);
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
IsBoundary
>
(
out
,
&
store_data
[
0
],
num
);
}
};
...
...
@@ -285,16 +285,16 @@ __device__ void SelectKernelImpl(OutT *out,
kps
::
Init
<
IdT
,
kCVecSize
>
(
&
num_thread
[
0
],
init_idx
);
kps
::
Init
<
MT
,
VecSize
>
(
&
mask_data
[
0
],
init_mask
);
// Load mask
kps
::
ReadData
<
MT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
&
mask_data
[
0
],
mask
,
num
);
kps
::
ReadData
<
MT
,
VecSize
,
1
,
IsBoundary
>
(
&
mask_data
[
0
],
mask
,
num
);
// Cast from MT to int
kps
::
ElementwiseUnary
<
MT
,
IdT
,
VecSize
,
1
,
1
,
Cast
>
(
kps
::
ElementwiseUnary
<
MT
,
IdT
,
VecSize
,
1
,
Cast
>
(
&
mask_idt
[
0
],
&
mask_data
[
0
],
Cast
());
// Get the num of thread only num_thread[1] has data
kps
::
Reduce
<
IdT
,
VecSize
,
1
,
1
,
Add
,
Mode
::
kLocalMode
>
(
kps
::
Reduce
<
IdT
,
VecSize
,
1
,
Add
,
Mode
::
kLocalMode
>
(
&
num_thread
[
0
],
&
mask_idt
[
0
],
Add
(),
true
);
// Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
// thread_fix
kps
::
Cumsum
<
IdT
,
IdT
,
1
,
Add
>
(
&
cumsum_thread
[
0
],
&
num_thread
[
0
],
Add
());
kps
::
Cumsum
<
IdT
,
IdT
,
Add
>
(
&
cumsum_thread
[
0
],
&
num_thread
[
0
],
Add
());
// get thread_fix
int
thread_fix
=
(
static_cast
<
int
>
(
cumsum_thread
[
0
]
-
num_thread
[
0
])
*
store_rank
);
...
...
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
浏览文件 @
1a0cd447
...
...
@@ -311,9 +311,9 @@ __global__ void WarpSoftmaxForward(T* softmax,
const
VecT
*
src_v
=
reinterpret_cast
<
const
VecT
*>
(
&
src
[(
first_batch
+
i
)
*
stride
]);
VecT
*
reg_v
=
reinterpret_cast
<
VecT
*>
(
&
src_data
[
i
][
0
][
0
]);
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
1
,
true
>
(
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
true
>
(
&
reg_v
[
0
],
&
src_v
[
0
],
idx_max_v
[
i
],
0
,
kWarpSize
,
1
);
kps
::
ElementwiseUnary
<
T
,
AccT
,
kVItem
,
1
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
kps
::
ElementwiseUnary
<
T
,
AccT
,
kVItem
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
&
sub_data
[
i
][
0
][
0
],
&
src_data
[
i
][
0
][
0
],
DataTransFunctor
<
T
,
AccT
>
());
}
...
...
@@ -321,7 +321,6 @@ __global__ void WarpSoftmaxForward(T* softmax,
kps
::
Reduce
<
AccT
,
kVItem
,
kBatchSize
,
1
,
ReduceMaxFunctor
<
AccT
>
,
kMode
::
kLocalMode
>
(
&
max
[
0
],
&
sub_data
[
0
][
0
][
0
],
ReduceMaxFunctor
<
AccT
>
(),
true
);
...
...
@@ -330,15 +329,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
// compute sum
#pragma unroll
for
(
int
i
=
0
;
i
<
kBatchSize
;
++
i
)
{
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
1
,
UnarySubFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
UnarySubFunctor
<
AccT
>>
(
&
sub_data
[
i
][
0
][
0
],
&
sub_data
[
i
][
0
][
0
],
UnarySubFunctor
<
AccT
>
(
max
[
i
]));
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
1
,
ExpFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
ExpFunctor
<
AccT
>>
(
&
exp_data
[
i
][
0
][
0
],
&
sub_data
[
i
][
0
][
0
],
ExpFunctor
<
AccT
>
());
}
kps
::
Reduce
<
AccT
,
kVItem
,
kBatchSize
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kMode
::
kLocalMode
>
(
&
sum
[
0
],
&
exp_data
[
0
][
0
][
0
],
kps
::
AddFunctor
<
AccT
>
(),
true
);
...
...
@@ -351,15 +349,15 @@ __global__ void WarpSoftmaxForward(T* softmax,
reinterpret_cast
<
VecT
*>
(
&
softmax
[(
first_batch
+
i
)
*
stride
]);
VecT
*
reg_v
=
reinterpret_cast
<
VecT
*>
(
&
out_tmp
[
i
][
0
][
0
]);
if
(
LogMode
)
{
kps
::
ElementwiseUnary
<
AccT
,
T
,
kVItem
,
1
,
1
,
UnarySubFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
T
,
kVItem
,
1
,
UnarySubFunctor
<
AccT
>>
(
&
out_tmp
[
i
][
0
][
0
],
&
sub_data
[
i
][
0
][
0
],
UnarySubFunctor
<
AccT
>
(
std
::
log
(
sum
[
i
])));
}
else
{
kps
::
ElementwiseUnary
<
AccT
,
T
,
kVItem
,
1
,
1
,
UnaryDivFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
T
,
kVItem
,
1
,
UnaryDivFunctor
<
AccT
>>
(
&
out_tmp
[
i
][
0
][
0
],
&
exp_data
[
i
][
0
][
0
],
UnaryDivFunctor
<
AccT
>
(
sum
[
i
]));
}
kps
::
WriteData
<
VecT
,
VecT
,
kLoopsV
,
1
,
1
,
true
>
(
kps
::
WriteData
<
VecT
,
VecT
,
kLoopsV
,
1
,
true
>
(
&
softmax_v
[
0
],
&
reg_v
[
0
],
idx_max_v
[
i
],
0
,
kWarpSize
,
1
);
}
}
...
...
@@ -417,9 +415,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
int
ptr
=
(
first_batch
+
i
)
*
stride
;
const
VecT
*
src_v
=
reinterpret_cast
<
const
VecT
*>
(
&
src
[
ptr
]);
const
VecT
*
grad_v
=
reinterpret_cast
<
const
VecT
*>
(
&
grad
[
ptr
]);
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
1
,
true
>
(
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
true
>
(
&
src_reg
[
i
][
0
],
&
src_v
[
0
],
idx_max_v
[
i
],
0
,
kWarpSize
,
flag
);
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
1
,
true
>
(
kps
::
ReadData
<
VecT
,
VecT
,
kLoopsV
,
1
,
true
>
(
&
grad_reg
[
i
][
0
],
&
grad_v
[
0
],
idx_max_v
[
i
],
0
,
kWarpSize
,
flag
);
}
...
...
@@ -430,9 +428,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
const
T
*
grad_ptr
=
reinterpret_cast
<
const
T
*>
(
&
grad_reg
[
0
][
0
]);
constexpr
int
kStep
=
kBatchSize
*
kLoopsV
*
kVSize
;
constexpr
int
kVItem
=
kLoopsV
*
kVSize
;
kps
::
ElementwiseUnary
<
T
,
AccT
,
kStep
,
1
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
kps
::
ElementwiseUnary
<
T
,
AccT
,
kStep
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
&
src_tmp
[
0
][
0
][
0
],
&
src_ptr
[
0
],
DataTransFunctor
<
T
,
AccT
>
());
kps
::
ElementwiseUnary
<
T
,
AccT
,
kStep
,
1
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
kps
::
ElementwiseUnary
<
T
,
AccT
,
kStep
,
1
,
DataTransFunctor
<
T
,
AccT
>>
(
&
grad_tmp
[
0
][
0
][
0
],
&
grad_ptr
[
0
],
DataTransFunctor
<
T
,
AccT
>
());
// compute sum
...
...
@@ -444,17 +442,15 @@ __global__ void WarpSoftmaxBackward(T* dst,
kps
::
Reduce
<
AccT
,
kVItem
,
kBatchSize
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
sum
[
0
],
&
grad_tmp
[
0
][
0
][
0
],
kps
::
AddFunctor
<
AccT
>
(),
true
);
}
else
{
kps
::
ElementwiseBinary
<
AccT
,
AccT
,
kStep
,
1
,
1
,
kps
::
MulFunctor
<
AccT
>>
(
kps
::
ElementwiseBinary
<
AccT
,
AccT
,
kStep
,
1
,
kps
::
MulFunctor
<
AccT
>>
(
&
sum_tmp
[
0
][
0
][
0
],
&
gradptr
[
0
],
&
srcptr
[
0
],
kps
::
MulFunctor
<
AccT
>
());
kps
::
Reduce
<
AccT
,
kVItem
,
kBatchSize
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kps
::
details
::
ReduceMode
::
kLocalMode
>
(
&
sum
[
0
],
&
sum_tmp
[
0
][
0
][
0
],
kps
::
AddFunctor
<
AccT
>
(),
true
);
...
...
@@ -470,17 +466,17 @@ __global__ void WarpSoftmaxBackward(T* dst,
AccT
*
gradptr
=
reinterpret_cast
<
AccT
*>
(
&
grad_tmp
[
i
][
0
][
0
]);
AccT
*
srcptr
=
reinterpret_cast
<
AccT
*>
(
&
src_tmp
[
i
][
0
][
0
]);
if
(
LogMode
)
{
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
1
,
ExpMulFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
ExpMulFunctor
<
AccT
>>
(
&
out
[
i
][
0
][
0
],
&
srcptr
[
0
],
ExpMulFunctor
<
AccT
>
(
sum
[
i
]));
kps
::
ElementwiseBinary
<
AccT
,
T
,
kVItem
,
1
,
1
,
kps
::
SubFunctor
<
AccT
>>
(
kps
::
ElementwiseBinary
<
AccT
,
T
,
kVItem
,
1
,
kps
::
SubFunctor
<
AccT
>>
(
&
out_tmp
[
i
][
0
][
0
],
&
gradptr
[
0
],
&
out
[
i
][
0
][
0
],
kps
::
SubFunctor
<
AccT
>
());
}
else
{
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
1
,
UnarySubFunctor
<
AccT
>>
(
kps
::
ElementwiseUnary
<
AccT
,
AccT
,
kVItem
,
1
,
UnarySubFunctor
<
AccT
>>
(
&
out
[
i
][
0
][
0
],
&
gradptr
[
0
],
UnarySubFunctor
<
AccT
>
(
sum
[
i
]));
kps
::
ElementwiseBinary
<
AccT
,
T
,
kVItem
,
1
,
1
,
kps
::
MulFunctor
<
AccT
>>
(
kps
::
ElementwiseBinary
<
AccT
,
T
,
kVItem
,
1
,
kps
::
MulFunctor
<
AccT
>>
(
&
out_tmp
[
i
][
0
][
0
],
&
srcptr
[
0
],
&
out
[
i
][
0
][
0
],
...
...
@@ -488,7 +484,7 @@ __global__ void WarpSoftmaxBackward(T* dst,
}
VecT
*
dst_v
=
reinterpret_cast
<
VecT
*>
(
&
dst
[(
first_batch
+
i
)
*
stride
]);
VecT
*
reg_v
=
reinterpret_cast
<
VecT
*>
(
&
out_tmp
[
i
][
0
][
0
]);
kps
::
WriteData
<
VecT
,
VecT
,
kLoopsV
,
1
,
1
,
true
>
(
kps
::
WriteData
<
VecT
,
VecT
,
kLoopsV
,
1
,
true
>
(
&
dst_v
[
0
],
&
reg_v
[
0
],
idx_max_v
[
i
],
0
,
kWarpSize
,
1
);
}
}
...
...
@@ -636,7 +632,7 @@ __global__ void NormalSoftmaxForward(
}
if
(
blockDim
.
y
>
1
)
{
kps
::
Reduce
<
AccT
,
1
,
1
,
1
,
kps
::
MaxFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
kps
::
Reduce
<
AccT
,
1
,
1
,
kps
::
MaxFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
&
max_value
,
&
max_value
,
kps
::
MaxFunctor
<
AccT
>
(),
false
);
}
...
...
@@ -647,7 +643,7 @@ __global__ void NormalSoftmaxForward(
sum
+=
std
::
exp
(
value
-
max_value
);
}
if
(
blockDim
.
y
>
1
)
{
kps
::
Reduce
<
AccT
,
1
,
1
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
kps
::
Reduce
<
AccT
,
1
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
&
sum
,
&
sum
,
kps
::
AddFunctor
<
AccT
>
(),
false
);
}
...
...
@@ -695,7 +691,7 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
}
}
if
(
blockDim
.
y
>
1
)
{
kps
::
Reduce
<
AccT
,
1
,
1
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
kps
::
Reduce
<
AccT
,
1
,
1
,
kps
::
AddFunctor
<
AccT
>
,
kMode
::
kGlobalMode
>
(
&
sum
,
&
sum
,
kps
::
AddFunctor
<
AccT
>
(),
false
);
}
...
...
paddle/phi/kernels/primitive/compute_primitives.h
浏览文件 @
1a0cd447
...
...
@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
...
...
@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
...
...
@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
...
...
@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
...
...
@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
,
int
read_lens
)
{
#pragma unroll
...
...
@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
...
...
@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Arity: The size of ins.
* OpFunc: Compute functor which has an operator() as following:
...
...
@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointer to an array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
Arity
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
...
...
@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
...
...
@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
...
...
@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
...
...
@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseConstant
(
OutT
*
out
,
OpFunc
compute
)
{
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
...
@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* hiprandStatePhilox4_32_10_t.
* OutT: the type of out register.
* ReturnsCount: The number of random data generated by OpFunc.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
...
...
@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* compute: Compute function which was declared like OpFunc<T>().
*/
template
<
typename
StateType
,
typename
OutT
,
int
ReturnsCount
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
StateType
,
typename
OutT
,
int
ReturnsCount
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseRandom
(
OutT
*
out
,
OpFunc
compute
,
StateType
*
state
)
{
...
...
@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
* @template parameters
* InT: the type of input register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
...
...
@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
*/
#define SHARED_SIZE_LIMIT 512
template
<
typename
InT
,
typename
OutT
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
class
OpFunc
>
__device__
__forceinline__
void
Cumsum
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
...
...
@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
* @template parameters
* InT: the type of input register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
*
* @param
...
...
@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
#define SHARED_SIZE_LIMIT 1024
// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
// larger than blockDim.x * 2
template
<
typename
InT
,
typename
OutT
,
int
BlockSize
>
template
<
typename
InT
,
typename
OutT
>
__device__
__forceinline__
void
Sort
(
OutT
*
out
,
const
InT
*
in
,
int
num
,
...
...
@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
* InT: The type of input register.
* OutT: The type of out register.
* IndexType: The type of index.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
*
* @param
...
...
@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
* monotonic_type: if monotonic_type = 1 then sorted in ascending order, else
* sorted in descending order.
*/
template
<
typename
InT
,
typename
OutT
,
typename
IndexType
,
int
BlockSize
>
template
<
typename
InT
,
typename
OutT
,
typename
IndexType
>
__device__
__forceinline__
void
Sort
(
OutT
*
out
,
IndexType
*
out_index
,
const
InT
*
in
,
...
...
paddle/phi/kernels/primitive/compute_primitives_xpu2.h
浏览文件 @
1a0cd447
...
...
@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
...
...
@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
...
...
@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
...
...
@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
...
...
@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
,
int
read_lens
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
...
...
@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
...
...
@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Arity: The size of ins
* OpFunc: Compute functor which has an operator() as following:
...
...
@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointer to an array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
Arity
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
...
...
@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
...
...
@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
...
...
@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
...
...
@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
...
...
@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseConstant
(
OutT
*
out
,
OpFunc
compute
)
{
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
...
paddle/phi/kernels/primitive/datamover_primitives.h
浏览文件 @
1a0cd447
...
...
@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
size_nx
,
...
...
@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
num
)
{
...
...
@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
}
}
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
num
,
...
...
@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
typename
ArgsT
,
int
Index
,
bool
IsBoundary
=
false
>
...
...
@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
...
...
@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -507,7 +496,6 @@ template <typename Tx,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
typename
IndexCal
,
typename
Functor
,
...
...
@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* T: The type of data.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* src: The register pointer, the size is NX * NY.
* size: The current block needs to load size elements continuously.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
int
num
)
{
...
...
@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
}
}
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
int
num
,
...
...
@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
size_nx
,
...
...
@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
...
...
@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
>
template
<
typename
T
,
int
NX
,
int
NY
>
__device__
__forceinline__
void
InitWithDataIndex
(
T
*
dst
,
int
block_offset
)
{
int
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
#pragma unroll
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
1a0cd447
...
...
@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadData
(
Ty
*
dst
,
const
Tx
_global_ptr_
*
src
,
int
size_nx
,
...
...
@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
>
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
num
)
{
...
...
@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
}
}
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
>
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
num
,
...
...
@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
...
...
@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
typename
ArgsT
,
int
Index
,
bool
IsBoundary
>
...
...
@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
...
...
@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
...
@@ -662,7 +651,6 @@ template <typename Tx,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
typename
IndexCal
,
typename
Functor
,
...
...
@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
* T: The type of data.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
>
__device__
void
WriteData
(
T
_global_ptr_
*
dst
,
const
T
*
src
,
int
num
,
...
...
@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
}
}
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
>
__device__
void
WriteData
(
T
_global_ptr_
*
dst
,
const
T
*
src
,
int
num
)
{
int
thread_offset
=
core_id
()
*
NX
;
mfence_local
();
...
...
@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
WriteData
(
Ty
_global_ptr_
*
dst
,
const
Tx
*
src
,
int
size_nx
,
...
...
@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
...
...
@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
...
...
@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
>
template
<
typename
T
,
int
NX
,
int
NY
>
__device__
__forceinline__
void
InitWithDataIndex
(
T
*
dst
,
int
block_offset
)
{
int
thread_offset
=
block_offset
+
core_id
()
*
NX
;
#pragma unroll
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录