Unverified commit 1a0cd447 · authored Aug 23, 2022 by niuliling123, committed by GitHub on Aug 23, 2022
Delete the template parameter BLockSize in Kernel Primitive API (#45220)
Parent: 3a7b1810

Showing 13 changed files with 136 additions and 295 deletions (+136, -295)
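The change is mechanical across the whole patch: each kernel-primitive template in the kps / kernel_primitives namespace drops its BlockSize template parameter (which, per the removed doc comments, only recorded that threadIdx.x is used as the thread index on GPU), and every call site drops the literal 1 that filled that slot. A minimal before/after sketch of the declaration-level change, using ElementwiseUnary from the compute_primitives.h hunks below (bodies elided, so this is illustrative rather than a full implementation):

// Before: BlockSize is accepted as a template parameter but never used for indexing.
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                 OpFunc compute);

// After: the unused parameter is removed; the template argument list shrinks by one.
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                 OpFunc compute);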
paddle/fluid/operators/dropout_impl.cu.h                    +18  -18
paddle/fluid/operators/fused/attn_bias_add.cu.h              +5   -6
paddle/phi/kernels/funcs/broadcast_function.h                +2   -2
paddle/phi/kernels/funcs/distribution_helper.h               +3   -4
paddle/phi/kernels/funcs/elementwise_base.h                  +9   -9
paddle/phi/kernels/funcs/index_impl.cu.h                     +6   -6
paddle/phi/kernels/funcs/reduce_function.h                  +20  -35
paddle/phi/kernels/funcs/select_impl.cu.h                   +17  -17
paddle/phi/kernels/gpudnn/softmax_gpudnn.h                  +20  -24
paddle/phi/kernels/primitive/compute_primitives.h           +11  -63
paddle/phi/kernels/primitive/compute_primitives_xpu2.h       +7  -51
paddle/phi/kernels/primitive/datamover_primitives.h          +9  -30
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h     +9  -30
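For callers, the migration is a one-token deletion per call: the extra 1 that used to occupy the BlockSize slot disappears from the template argument list while the runtime arguments stay unchanged. A hedged sketch of that pattern, modeled on the VectorizedRandomGenerator hunk below (T, kCount, dst_mask, src, fix, and deal_size are names taken from that kernel, not a standalone program):

// Old call: template arguments are <T, NX, NY, BlockSize, IsBoundary>.
kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);

// New call: the BlockSize slot is removed, leaving <T, NX, NY, IsBoundary>.
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);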
paddle/fluid/operators/dropout_impl.cu.h

@@ -112,17 +112,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
     auto dst_functor =
         DstMaskFunctor<T, float>(1.0f - dropout_prob, is_upscale_in_train);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
+    kps::WriteData<T, kCount, 1, false>(dst + fix, &dst_mask[0], deal_size);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -130,17 +130,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
         &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
-    kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
+    kps::WriteData<T, kCount, 1, true>(dst + fix, &dst_mask[0], remainder);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[kCount], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
         mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
@@ -233,17 +233,17 @@ __global__ void VectorizedGeneratorMask(const size_t n,
     auto mask_functor = MaskFunctor<T, float>(1.0f - dropout_prob);
   for (; fix < main_offset; fix += stride) {
-    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
         &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, false>(
+    kps::WriteData<MaskType, kCount, 1, false>(
         mask + fix, &mask_result[0], deal_size);
     if (fix > idx * kCount + 1) {
       __syncthreads();
@@ -251,16 +251,16 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   }
   int remainder = n - fix;
   if (remainder > 0) {
-    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
-    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
+    kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, Rand>(
         &rands[0], Rand(), &state);
     // dst
     kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
         &dst_mask[0], &rands[0], mask_functor, kCount);
     // mask
-    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
         &mask_result[0], &dst_mask[0], Cast());
-    kps::WriteData<MaskType, kCount, 1, 1, true>(
+    kps::WriteData<MaskType, kCount, 1, true>(
         mask + fix, &mask_result[0], remainder);
     __syncthreads();
   }
paddle/fluid/operators/fused/attn_bias_add.cu.h

@@ -73,24 +73,23 @@ __global__ void BroadcastKernelBinary(
   // load in0
   if (use_broadcast[0]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg0, in0, fix, configlists[0], numel);
   } else {
     kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
   }
   // load in1
   if (use_broadcast[1]) {
-    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
+    kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
         arg1, in1, fix, configlists[1], numel);
   } else {
-    kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
+    kernel_primitives::ReadData<InT, VecSize, 1>(arg1, in1 + fix, num);
   }
   // compute
-  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+  kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
       result, arg0, arg1, func);
   // store
-  kernel_primitives::WriteData<OutT, VecSize, 1, 1, true>(
+  kernel_primitives::WriteData<OutT, VecSize, 1, true>(
       out + fix, result, num);
 }
 // bias add forward impl for "[m, n] + [n] = [m, n]"
paddle/phi/kernels/funcs/broadcast_function.h

@@ -266,10 +266,10 @@ __device__ __forceinline__ void LoadData(
   // numel : whole num of output
   // num: how many data will be deal with in this time
   if (need_broadcast) {
-    kps::ReadDataBc<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadDataBc<T, VecSize, 1, IsBoundary>(
         dst, src, block_offset, config, numel, read_lens);
   } else {
-    kps::ReadData<T, VecSize, 1, 1, IsBoundary>(
+    kps::ReadData<T, VecSize, 1, IsBoundary>(
         dst, src + block_offset, num, read_lens);
   }
 }
paddle/phi/kernels/funcs/distribution_helper.h

@@ -278,11 +278,10 @@ __global__ void DistributionKernel(size_t size,
   MT args[kCount];
   T result[kCount];
   for (size_t i = idx; i < size; i += total_thread * kCount) {
-    kps::ElementwiseRandom<SType, MT, kCount, 1, DistOp>(
-        &args[0], dist, &state);
-    kps::ElementwiseUnary<MT, T, kCount, 1, 1, TransformOp>(
+    kps::ElementwiseRandom<SType, MT, kCount, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<MT, T, kCount, 1, TransformOp>(
         &result[0], &args[0], trans);
-    kps::WriteData<T, T, kCount, 1, 1, true>(
+    kps::WriteData<T, T, kCount, 1, true>(
         out_data + i, &result[0], size - i, 1, stride, 1);
     __syncthreads();
   }
paddle/phi/kernels/funcs/elementwise_base.h

@@ -519,13 +519,13 @@ struct Loader {
     kps::Init<Type, ArgsT, Index, VecSize>(
         args, static_cast<Type>(1.0f), read_lens);
     if (is_boundary) {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, true>(
           args,
           reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
           num,
           read_lens);
     } else {
-      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
+      kps::ReadData<Type, VecSize, 1, ArgsT, Index, false>(
          args,
          reinterpret_cast<const _ptr_ Type*>(in[Index]) + offset,
          num,
@@ -595,7 +595,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity, true> {
                                        InT (*args)[VecSize],
                                        OutT* result,
                                        int read_lens) {
-    kps::ElementwiseAny<InT, OutT, VecSize, 1, 1, Arity, Functor>(
+    kps::ElementwiseAny<InT, OutT, VecSize, 1, Arity, Functor>(
        result, args, func);
   }
 };
@@ -606,7 +606,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 0, false> {
                                        InT (*args)[VecSize],
                                        OutT* result,
                                        int read_lens) {
-    kps::ElementwiseConstant<InT, OutT, VecSize, 1, 1, Functor>(result, func);
+    kps::ElementwiseConstant<InT, OutT, VecSize, 1, Functor>(result, func);
   }
 };
@@ -616,7 +616,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 1, false> {
                                        InT (*args)[VecSize],
                                        OutT* result,
                                        int read_lens) {
-    kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseUnary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], func);
   }
 };
@@ -627,7 +627,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 2, false> {
                                        InT (*args)[VecSize],
                                        OutT* result,
                                        int read_lens) {
-    kps::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], func, read_lens);
   }
 };
@@ -638,7 +638,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
                                        InT (*args)[VecSize],
                                        OutT* result,
                                        int read_lens) {
-    kps::ElementwiseTernary<InT, OutT, VecSize, 1, 1, Functor>(
+    kps::ElementwiseTernary<InT, OutT, VecSize, 1, Functor>(
        result, args[0], args[1], args[2], func);
   }
 };
@@ -703,7 +703,7 @@ struct ElementwiseWriteDataCallerBc {
     }
 #pragma unroll
     for (int i = 0; i < NumOuts; ++i) {
-      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+      kps::WriteData<OutT, VecSize, 1, IsBoundary>(
          outs[i] + block_offset, dst[i], num, read_lens);
     }
   }
@@ -716,7 +716,7 @@ struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
                  kps::IndexType block_offset,
                  int num,
                  int read_lens) {
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(
        outs[0] + block_offset, src, num, read_lens);
   }
 };
paddle/phi/kernels/funcs/index_impl.cu.h

@@ -36,18 +36,18 @@ __global__ void VectorizedIndexKernel(T *out,
   size_t args[VecSize];
   T result[VecSize];
   for (; data_offset < main_offset; data_offset += stride) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, false>(
+    kps::WriteData<T, VecSize, 1, false>(
         out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
   }
   size_t num = numel - data_offset;
   if (num > 0) {
-    kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
-    kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
+    kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
+    kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
-    kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
+    kps::WriteData<T, VecSize, 1, true>(out + data_offset, &result[0], num);
   }
 }
paddle/phi/kernels/funcs/reduce_function.h

@@ -712,7 +712,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                    1,
                    REDUCE_VEC_SIZE,
                    1,
-                   1,
                    Calculator,
                    kps::IdentityFunctor<Tx>,
                    false>(&input_reg[0],
@@ -725,12 +724,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
                    stride,
                    kps::IdentityFunctor<Tx>(),
                    reduce_last_dim);
-    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, TransformOp>(
        &input_compute[0], &input_reg[0], transformer);
    kps::Reduce<MPType,
                REDUCE_VEC_SIZE,
                1,
-                1,
                ReduceOp,
                kps::details::ReduceMode::kLocalMode>(
        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
@@ -742,7 +740,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
                    1,
                    REDUCE_VEC_SIZE,
                    1,
-                   1,
                    Calculator,
                    TransformOp,
                    true>(&input_compute[0],
@@ -758,12 +755,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
   kps::Reduce<MPType,
               REDUCE_VEC_SIZE,
               1,
-              1,
               ReduceOp,
               kps::details::ReduceMode::kLocalMode>(
       &reduce_var, &input_compute[0], reducer, reduce_last_dim);
-  kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
+  kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::kGlobalMode>(
      &reduce_var, &reduce_var, reducer, reduce_last_dim);
   if (is_mean) {
     reduce_var = reduce_var / static_cast<MPType>(reduce_num);
@@ -807,27 +803,22 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
    MPType reduce_var = init;
    MPType reduce_compute = init;
    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-      kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
-                                            input + loop_idx * left_num + idx,
-                                            block.BlockDimX(),
-                                            1,
-                                            1,
-                                            left_num);
-      kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+      kps::ReadData<Tx, Tx, 1, 1, false>(&reduce_input,
+                                         input + loop_idx * left_num + idx,
+                                         block.BlockDimX(),
+                                         1,
+                                         1,
+                                         left_num);
+      kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
          &reduce_compute, &reduce_input, transformer);
      kps::Reduce<MPType,
                  1,
                  1,
-                  1,
                  ReduceOp,
                  kps::details::ReduceMode::kLocalMode>(
          &reduce_var, &reduce_compute, reducer, false);
    }
    if (is_mean) {
      reduce_var = reduce_var / static_cast<MPType>(mean_div);
    }
    Ty result = static_cast<Ty>(reduce_var);
-    kps::WriteData<Ty, 1, 1, 1, false>(
+    kps::WriteData<Ty, 1, 1, false>(
        y + store_offset + idx, &result, block.BlockDimX());
  }
@@ -835,20 +826,15 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
    MPType reduce_var = init;
    MPType reduce_compute = init;
    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
-      kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
-                                           input + loop_idx * left_num + idx,
-                                           dim.rem_x,
-                                           1,
-                                           1,
-                                           left_num);
-      kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
+      kps::ReadData<Tx, Tx, 1, 1, true>(&reduce_input,
+                                        input + loop_idx * left_num + idx,
+                                        dim.rem_x,
+                                        1,
+                                        1,
+                                        left_num);
+      kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
          &reduce_compute, &reduce_input, transformer);
      kps::Reduce<MPType,
                  1,
                  1,
-                  1,
                  ReduceOp,
                  kps::details::ReduceMode::kLocalMode>(
          &reduce_var, &reduce_compute, reducer, false);
    }
@@ -856,8 +842,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
      reduce_var = reduce_var / static_cast<MPType>(mean_div);
    }
    Ty result = static_cast<Ty>(reduce_var);
-    kps::WriteData<Ty, 1, 1, 1, true>(
+    kps::WriteData<Ty, 1, 1, true>(
        y + store_offset + idx, &result, dim.rem_x);
  }
 }
paddle/phi/kernels/funcs/select_impl.cu.h

@@ -71,21 +71,21 @@ __device__ void GetBlockCountImpl(const InT *in,
   int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
   kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
-  kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
-  kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
+  kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
+  kps::ElementwiseUnary<InT, OutT, VecSize, 1, Cast>(
      &temp[0], &in_data[0], Cast());
-  kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<OutT, VecSize, 1, Add, Mode::kLocalMode>(
      &result, &temp[0], Add(), true);
-  kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
+  kps::Reduce<OutT, 1, 1, Add, Mode::kGlobalMode>(
      &result, &result, Add(), true);
   if (store_fix == 0) {
     // first block's fix_size = 0;
     OutT tmp = static_cast<OutT>(0.0f);
-    kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
+    kps::WriteData<OutT, 1, 1, true>(out + store_fix, &tmp, 1);
   }
   // store num of this block
-  kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
+  kps::WriteData<OutT, 1, 1, true>(out + store_fix + 1, &result, 1);
 }
 // Count how many data is not zero in current block
@@ -132,12 +132,12 @@ __device__ void CumsumImpl(
   // set pre_cumsum
   kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
   // load data to arg
-  kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
+  kps::ReadData<InT, InT, VecSize, 1, IsBoundary>(
      &arg[0], in, num, 1, BLOCK_NUM_X, 1);
   // block cumsum
-  kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
+  kps::Cumsum<InT, OutT, Functor>(&result[0], &arg[0], func);
   // result = cumsum_result + pre_cumsum
-  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
+  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, Functor>(
      &result[0], &result[0], &temp[0], func);
   // get the last prefix sum
   if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
@@ -146,7 +146,7 @@ __device__ void CumsumImpl(
   __syncthreads();
   // update pre_cumsum
   *pre_cumsum = max_thread_data;
-  kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
+  kps::WriteData<OutT, OutT, VecSize, 1, IsBoundary>(
      out, &result[0], num, 1, BLOCK_NUM_X, 1);
 }
@@ -189,7 +189,7 @@ struct SelectCaller {
    int64_t in_data[VecSize];
    OutT store_data[VecSize * phi::DDim::kMaxRank];
    // set index
-    kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
+    kps::InitWithDataIndex<int64_t, VecSize, 1>(&in_data[0], data_offset);
    // Get store data according to mask_idt
    kps::OperatorTernary<MT, int64_t, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -215,7 +215,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
                              int num) {
    InT in_data[VecSize];
    OutT store_data[VecSize * phi::DDim::kMaxRank];
-    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
+    kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
    // Get store data according to mask_idt
    kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
@@ -244,7 +244,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
    kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
    kps::OperatorTernary<MT, InT, OutT, Functor>(
        store_data, mask_data, &in_data[0], func, VecSize);
-    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
+    kps::WriteData<OutT, VecSize, 1, IsBoundary>(out, &store_data[0], num);
  }
 };
@@ -285,16 +285,16 @@ __device__ void SelectKernelImpl(OutT *out,
   kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
   kps::Init<MT, VecSize>(&mask_data[0], init_mask);
   // Load mask
-  kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
+  kps::ReadData<MT, VecSize, 1, IsBoundary>(&mask_data[0], mask, num);
   // Cast from MT to int
-  kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
+  kps::ElementwiseUnary<MT, IdT, VecSize, 1, Cast>(
      &mask_idt[0], &mask_data[0], Cast());
   // Get the num of thread only num_thread[1] has data
-  kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+  kps::Reduce<IdT, VecSize, 1, Add, Mode::kLocalMode>(
      &num_thread[0], &mask_idt[0], Add(), true);
   // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
   // thread_fix
-  kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
+  kps::Cumsum<IdT, IdT, Add>(&cumsum_thread[0], &num_thread[0], Add());
   // get thread_fix
   int thread_fix =
      (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
paddle/phi/kernels/gpudnn/softmax_gpudnn.h

@@ -311,9 +311,9 @@ __global__ void WarpSoftmaxForward(T* softmax,
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
-    kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
+    kps::ElementwiseUnary<T, AccT, kVItem, 1, DataTransFunctor<T, AccT>>(
        &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
   }
@@ -321,7 +321,6 @@ __global__ void WarpSoftmaxForward(T* softmax,
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               ReduceMaxFunctor<AccT>,
               kMode::kLocalMode>(
       &max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
@@ -330,15 +329,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
   // compute sum
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
        &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(
        &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
   }
   kps::Reduce<AccT,
               kVItem,
               kBatchSize,
-              1,
               kps::AddFunctor<AccT>,
               kMode::kLocalMode>(
       &sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -351,15 +349,15 @@ __global__ void WarpSoftmaxForward(T* softmax,
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnarySubFunctor<AccT>>(
          &out_tmp[i][0][0],
          &sub_data[i][0][0],
          UnarySubFunctor<AccT>(std::log(sum[i])));
     } else {
-      kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, T, kVItem, 1, UnaryDivFunctor<AccT>>(
          &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
     }
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -417,9 +415,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
     int ptr = (first_batch + i) * stride;
     const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
     const VecT* grad_v = reinterpret_cast<const VecT*>(&grad[ptr]);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &src_reg[i][0], &src_v[0], idx_max_v[i], 0, kWarpSize, flag);
-    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
        &grad_reg[i][0], &grad_v[0], idx_max_v[i], 0, kWarpSize, flag);
   }
@@ -430,9 +428,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
   const T* grad_ptr = reinterpret_cast<const T*>(&grad_reg[0][0]);
   constexpr int kStep = kBatchSize * kLoopsV * kVSize;
   constexpr int kVItem = kLoopsV * kVSize;
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
      &src_tmp[0][0][0], &src_ptr[0], DataTransFunctor<T, AccT>());
-  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+  kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
      &grad_tmp[0][0][0], &grad_ptr[0], DataTransFunctor<T, AccT>());
   // compute sum
@@ -444,17 +442,15 @@ __global__ void WarpSoftmaxBackward(T* dst,
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
         &sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
   } else {
-    kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+    kps::ElementwiseBinary<AccT, AccT, kStep, 1, kps::MulFunctor<AccT>>(
        &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
     kps::Reduce<AccT,
                 kVItem,
                 kBatchSize,
-                1,
                 kps::AddFunctor<AccT>,
                 kps::details::ReduceMode::kLocalMode>(
         &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
@@ -470,17 +466,17 @@ __global__ void WarpSoftmaxBackward(T* dst,
     AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
     AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpMulFunctor<AccT>>(
          &out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::SubFunctor<AccT>>(
          &out_tmp[i][0][0],
          &gradptr[0],
          &out[i][0][0],
         kps::SubFunctor<AccT>());
     } else {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
          &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
-      kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+      kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::MulFunctor<AccT>>(
          &out_tmp[i][0][0],
          &srcptr[0],
          &out[i][0][0],
@@ -488,7 +484,7 @@ __global__ void WarpSoftmaxBackward(T* dst,
     }
     VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
-    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+    kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
        &dst_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
@@ -636,7 +632,7 @@ __global__ void NormalSoftmaxForward(
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
        &max_value, &max_value, kps::MaxFunctor<AccT>(), false);
   }
@@ -647,7 +643,7 @@ __global__ void NormalSoftmaxForward(
     sum += std::exp(value - max_value);
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
@@ -695,7 +691,7 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
     }
   }
   if (blockDim.y > 1) {
-    kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
+    kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
        &sum, &sum, kps::AddFunctor<AccT>(), false);
   }
paddle/phi/kernels/primitive/compute_primitives.h
浏览文件 @
1a0cd447
...
@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
...
@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* template <typename InT, typename OutT>
...
@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
...
@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
const
InT
*
in
,
OpFunc
compute
)
{
OpFunc
compute
)
{
...
@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
...
@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
* template <typename InT>
...
@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
...
@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
...
@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
...
@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
}
}
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
,
int
read_lens
)
{
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
,
int
read_lens
)
{
#pragma unroll
#pragma unroll
...
@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
...
@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
...
@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
...
@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
...
@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Arity: The size of ins.
* Arity: The size of ins.
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
...
@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
...
@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
Arity
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
OpFunc
compute
)
{
...
@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
...
@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
* template <typename InT, typename OutT>
...
@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
...
@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
...
@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
...
@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* ReduceFunctor: Compute functor which has an operator() as following
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
...
@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template
<
typename
T
,
template
<
typename
T
,
int
NX
,
int
NX
,
int
NY
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
__device__
__forceinline__
void
Reduce
(
T
*
out
,
...
@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
...
@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
...
@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
class
OpFunc
>
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseConstant
(
OutT
*
out
,
OpFunc
compute
)
{
__device__
__forceinline__
void
ElementwiseConstant
(
OutT
*
out
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
...
@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* hiprandStatePhilox4_32_10_t.
* hiprandStatePhilox4_32_10_t.
* OutT: the type of out register.
* OutT: the type of out register.
* ReturnsCount: The number of random data generated by OpFunc.
* ReturnsCount: The number of random data generated by OpFunc.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
* template <typename T>
...
@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
...
@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* compute: Compute function which was declared like OpFunc<T>().
* compute: Compute function which was declared like OpFunc<T>().
*/
*/
template <typename StateType, typename OutT, int ReturnsCount, int BlockSize, class OpFunc>
template <typename StateType, typename OutT, int ReturnsCount, class OpFunc>
__device__ __forceinline__ void ElementwiseRandom(OutT* out,
                                                  OpFunc compute,
                                                  StateType* state) {
...
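A hedged sketch of the random OpFunc the ElementwiseRandom comment refers to, assuming the CUDA Philox state named in that comment; UniformGenerator is a hypothetical name, not code from this commit.
// Hypothetical generator functor: curand_uniform4 yields four floats per call,
// so ReturnsCount would be 4 at the call site.
struct UniformGenerator {
  __device__ __forceinline__ float4 operator()(
      curandStatePhilox4_32_10_t* state) const {
    return curand_uniform4(state);
  }
};
// New call-site template list (BlockSize gone):
//   kps::ElementwiseRandom<curandStatePhilox4_32_10_t, float, 4, UniformGenerator>(
//       out, UniformGenerator(), &state);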
@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
...
@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
* @template paraments
* @template paraments
* InT: the type of input register.
* InT: the type of input register.
* OutT: the type of out register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
* template <typename T>
...
@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
...
@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
*/
*/
#define SHARED_SIZE_LIMIT 512
#define SHARED_SIZE_LIMIT 512
template <typename InT, typename OutT, int BlockSize, class OpFunc>
template <typename InT, typename OutT, class OpFunc>
__device__ __forceinline__ void Cumsum(OutT* out,
                                       const InT* in,
                                       OpFunc compute) {
...
@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
...
@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
* @template paraments
* @template paraments
* InT: the type of input register.
* InT: the type of input register.
* OutT: the type of out register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* GPU was supported.
*
*
* @param
* @param
...
@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
...
@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
#define SHARED_SIZE_LIMIT 1024
#define SHARED_SIZE_LIMIT 1024
// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
// larger than blockDim.x * 2
// larger than blockDim.x * 2
template <typename InT, typename OutT, int BlockSize>
template <typename InT, typename OutT>
__device__ __forceinline__ void Sort(OutT* out,
                                     const InT* in,
                                     int num,
...
@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
...
@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
* InT: The type of input register.
* InT: The type of input register.
* OutT: The type of out register.
* OutT: The type of out register.
* IndexType: The type of index.
* IndexType: The type of index.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* GPU was supported.
*
*
* @param
* @param
...
@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
...
@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
* sorted in escending.
* sorted in escending.
*/
*/
template <typename InT, typename OutT, typename IndexType, int BlockSize>
template <typename InT, typename OutT, typename IndexType>
__device__ __forceinline__ void Sort(OutT* out,
                                     IndexType* out_index,
                                     const InT* in,
...
...
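Across this header the migration is mechanical: callers drop the BlockSize template argument (usually written as 1) and keep everything else. A hypothetical device-side wrapper, assuming the usual namespace alias kps = phi::kps; ApplyUnary and Functor are illustrative names only.
// Applies a unary functor to NX register values with the new template list.
template <typename T, int NX, typename Functor>
__device__ __forceinline__ void ApplyUnary(T* out, const T* in, Functor func) {
  // old: kps::ElementwiseUnary<T, T, NX, 1, 1, Functor>(out, in, func);
  kps::ElementwiseUnary<T, T, NX, 1, Functor>(out, in, func);
}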
paddle/phi/kernels/primitive/compute_primitives_xpu2.h
View file @ 1a0cd447
...
@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
...
@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* template <typename InT, typename OutT>
...
@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
...
@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out,
                                                 const InT* in,
                                                 OpFunc compute) {
...
@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
...
@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
* template <typename InT>
...
@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
...
@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out,
                                                  const InT* in1,
                                                  const InT* in2,
...
@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
...
@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
}
}
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(
    OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
  for (int idx = 0; idx < read_lens; ++idx) {
...
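For reference, a minimal binary functor of the kind the ElementwiseBinary comments describe, written for the CUDA build; AddFunctor is a hypothetical example, not taken from this commit.
// Hypothetical binary OpFunc: both inputs share InT, the result feeds OutT.
template <typename InT>
struct AddFunctor {
  __device__ __forceinline__ InT operator()(const InT& a, const InT& b) const {
    return a + b;
  }
};
// New call-site template list:
//   kps::ElementwiseBinary<InT, OutT, NX, NY, AddFunctor<InT>>(out, in1, in2, AddFunctor<InT>());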
@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
...
@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
...
@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(
    OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
#pragma unroll
...
@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
...
@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* Arity: The size of ins
* Arity: The size of ins
* OpFunc: Compute functor which has an operator() as following:
* OpFunc: Compute functor which has an operator() as following:
...
@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
...
@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out,
                                               InT (*ins)[NX * NY],
                                               OpFunc compute) {
...
@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
...
@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
* template <typename InT, typename OutT>
...
@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
...
@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out,
                                            const InT* in1,
                                            const InT* in2,
...
@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
...
@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* ReduceFunctor: Compute functor which has an operator() as following
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
...
@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor, details::ReduceMode Mode>
template <typename T, int NX, int NY, class ReduceFunctor, details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out,
...
@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
...
@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
* template <typename InT>
...
@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
...
@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; idx++) {
...
...
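The XPU header mirrors the CUDA one, so the same functor types can serve both builds once the BlockSize argument is dropped. A hedged three-input example for ElementwiseTernary; SelectFunctor is a hypothetical name, not part of this commit.
// Hypothetical ternary OpFunc: picks a or b depending on cond.
template <typename InT>
struct SelectFunctor {
  __device__ __forceinline__ InT operator()(const InT& cond,
                                            const InT& a,
                                            const InT& b) const {
    return cond != static_cast<InT>(0) ? a : b;
  }
};
// New call-site template list on either backend:
//   kps::ElementwiseTernary<InT, OutT, NX, NY, SelectFunctor<InT>>(out, in1, in2, in3, SelectFunctor<InT>());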
paddle/phi/kernels/primitive/datamover_primitives.h
View file @ 1a0cd447
...
@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
...
@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* Ty: The type of data that needs to be stored in registers.
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
...
@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst,
                                         const Tx* __restrict__ src,
                                         int size_nx,
...
@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
...
@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* When the number of data processed by this block is less than
...
@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
...
@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst,
                                         const T* __restrict__ src,
                                         int num) {
...
@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
...
@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
}
}
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst,
                                         const T* __restrict__ src,
                                         int num,
...
@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
...
@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* When the number of data processed by this block is less than
...
@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
...
@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
template <typename T, int NX, int NY, int BlockSize, typename ArgsT, int Index, bool IsBoundary = false>
template <typename T, int NX, int NY, typename ArgsT, int Index, bool IsBoundary = false>
...
@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
...
@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
...
@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst,
    const T* __restrict__ src,
...
@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
* T: The type of data.
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
@@ -507,7 +496,6 @@ template <typename Tx,
...
@@ -507,7 +496,6 @@ template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          int Rank,
          typename IndexCal,
          typename Functor,
...
@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
...
@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* T: The type of data.
* T: The type of data.
* NX: The number of data continuously writed by each thread.
* NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
...
@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* src: The register pointer, the size is NX * NY.
* src: The register pointer, the size is NX * NY.
* size: The current block needs to load size elements continuously.
* size: The current block needs to load size elements continuously.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst,
                                          T* __restrict__ src,
                                          int num) {
...
@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
...
@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
}
}
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst,
                                          T* __restrict__ src,
                                          int num,
...
@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
...
@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
* Ty: The type of data that stored in the global memory.
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
...
@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst,
                                          const Tx* __restrict__ src,
                                          int size_nx,
...
@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
...
@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* T: The type of data stored in the global memory.
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
...
@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst,
    const T* __restrict__ src,
...
@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
* T: Data type of register.
* T: Data type of register.
* NX: Number of data to initialize.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
*
*
* @param:
* @param:
* dst: The register pointer of the thread, the size is NX.
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
  int thread_offset = block_offset + threadIdx.x * NX;
#pragma unroll
...
...
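A minimal CUDA kernel sketch showing how the data-mover primitives combine after this change; the aggregate kernel_primitives.h include, the kps namespace alias, and SquareFunctor are assumptions for illustration, not code from this commit.
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace kps = phi::kps;

template <typename T>
struct SquareFunctor {
  __device__ __forceinline__ T operator()(const T& x) const { return x * x; }
};

// Each thread moves VecSize elements: ReadData / WriteData now take
// <T, NX, NY, IsBoundary> instead of <T, NX, NY, BlockSize, IsBoundary>.
template <typename T, int VecSize>
__global__ void SquareKernel(const T* in, T* out, int numel) {
  int tile = blockDim.x * VecSize;
  int block_offset = blockIdx.x * tile;
  int num = numel - block_offset;
  if (num > tile) num = tile;

  T in_reg[VecSize];
  T out_reg[VecSize];
  kps::ReadData<T, VecSize, 1, true>(in_reg, in + block_offset, num);
  kps::ElementwiseUnary<T, T, VecSize, 1, SquareFunctor<T>>(
      out_reg, in_reg, SquareFunctor<T>());
  kps::WriteData<T, VecSize, 1, true>(out + block_offset, out_reg, num);
}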
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
View file @ 1a0cd447
...
@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
...
@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* Ty: The type of data that needs to be stored in registers.
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
...
@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst,
                                    const Tx _global_ptr_* src,
                                    int size_nx,
...
@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
...
@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* When the number of data processed by this block is less than
...
@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
...
@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num) {
...
@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
...
@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
}
}
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
                                    const T _global_ptr_* src,
                                    int num,
...
@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
...
@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* When the number of data processed by this block is less than
...
@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
...
@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
template <typename T, int NX, int NY, int BlockSize, typename ArgsT, int Index, bool IsBoundary>
template <typename T, int NX, int NY, typename ArgsT, int Index, bool IsBoundary>
...
@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
...
@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
...
@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
...
@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: The type of data.
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
...
@@ -662,7 +651,6 @@ template <typename Tx,
...
@@ -662,7 +651,6 @@ template <typename Tx,
          typename Ty,
          int NX,
          int NY,
          int BlockSize,
          int Rank,
          typename IndexCal,
          typename Functor,
...
@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
* T: The type of data.
* T: The type of data.
* NX: The number of data continuously writed by each thread.
* NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
* size: The current block needs to load size elements continuously.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst,
                          const T* src,
                          int num,
...
@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
...
@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
}
}
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  int thread_offset = core_id() * NX;
  mfence_local();
...
@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
...
@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* Ty: The type of data stored in the global memory.
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
...
@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
                                     const Tx* src,
                                     int size_nx,
...
@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
...
@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* T: The type of data stored in the global memory.
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* judgment. When the number of data processed by the block is less than
...
@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
...
@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
                                      const T _global_ptr_* src,
                                      uint32_t block_offset,
...
@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
...
@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: Data type of register.
* T: Data type of register.
* NX: Number of data to initialize.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* core_id() is used as the index.
*
*
* @param:
* @param:
* dst: The register pointer of the thread, the size is NX.
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
  int thread_offset = block_offset + core_id() * NX;
#pragma unroll
...
...
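Finally, a hedged sketch of InitWithDataIndex with its new <T, NX, NY> template list, reusing the include and kps alias assumed in the previous sketch; IotaKernel is an illustrative name only.
// Fills out[i] = i by letting each thread materialise its own global indices.
template <typename T, int VecSize>
__global__ void IotaKernel(T* out, int numel) {
  int tile = blockDim.x * VecSize;
  int block_offset = blockIdx.x * tile;
  int num = numel - block_offset;
  if (num > tile) num = tile;

  T idx_reg[VecSize];
  kps::InitWithDataIndex<T, VecSize, 1>(idx_reg, block_offset);
  kps::WriteData<T, VecSize, 1, true>(out + block_offset, idx_reg, num);
}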