Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
9c5d5665
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9c5d5665
编写于
11月 17, 2021
作者:
N
niuliling123
提交者:
GitHub
11月 17, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Modify reduce_op.op.h for xpu2 with kernel primitive api (#36904)
* Modify reduce_op.op.h for xpu2 with kernel primitive api
上级
d08753df
变更
6
展开全部
显示空白变更内容
内联
并排
Showing
6 changed file
with
407 addition
and
239 deletion
+407
-239
paddle/fluid/operators/kernel_primitives/datamover_primitives.h
.../fluid/operators/kernel_primitives/datamover_primitives.h
+6
-6
paddle/fluid/operators/kernel_primitives/helper_primitives.h
paddle/fluid/operators/kernel_primitives/helper_primitives.h
+32
-47
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+36
-2
paddle/fluid/operators/margin_cross_entropy_op.cu
paddle/fluid/operators/margin_cross_entropy_op.cu
+4
-4
paddle/fluid/operators/reduce_ops/reduce_functor_op.h
paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+8
-8
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+321
-172
未找到文件。
paddle/fluid/operators/kernel_primitives/datamover_primitives.h
浏览文件 @
9c5d5665
...
@@ -360,12 +360,12 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -360,12 +360,12 @@ __device__ __forceinline__ void ReadDataBc(
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
* the lowest dimension.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
x
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
typename
IndexCal
,
bool
IsBoundary
=
false
>
typename
IndexCal
,
typename
Functor
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataReduce
(
__device__
__forceinline__
void
ReadDataReduce
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
block_offset
,
T
y
*
dst
,
const
Tx
*
__restrict__
src
,
int
block_offset
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
,
bool
reduce_last_dim
)
{
int
stride_ny
,
Functor
func
,
bool
reduce_last_dim
)
{
int
thread_offset
=
0
;
int
thread_offset
=
0
;
int
left_idx
=
0
;
int
left_idx
=
0
;
if
(
reduce_last_dim
)
{
if
(
reduce_last_dim
)
{
...
@@ -385,7 +385,7 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -385,7 +385,7 @@ __device__ __forceinline__ void ReadDataReduce(
}
}
}
}
uint32_t
index_src
=
index_cal
(
thread_offset
+
block_offset
);
uint32_t
index_src
=
index_cal
(
thread_offset
+
block_offset
);
dst
[
ny
]
=
s
rc
[
index_src
]
;
dst
[
ny
]
=
s
tatic_cast
<
Ty
>
(
func
(
src
[
index_src
]))
;
thread_offset
+=
stride_ny
;
thread_offset
+=
stride_ny
;
}
}
}
else
{
}
else
{
...
@@ -400,7 +400,7 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -400,7 +400,7 @@ __device__ __forceinline__ void ReadDataReduce(
}
}
}
}
uint32_t
index_src
=
index_cal
(
thread_offset
+
block_offset
);
uint32_t
index_src
=
index_cal
(
thread_offset
+
block_offset
);
dst
[
nx
+
ny
*
NX
]
=
s
rc
[
index_src
]
;
dst
[
nx
+
ny
*
NX
]
=
s
tatic_cast
<
Ty
>
(
func
(
src
[
index_src
]))
;
thread_offset
+=
stride_ny
;
thread_offset
+=
stride_ny
;
}
}
}
}
...
...
paddle/fluid/operators/kernel_primitives/helper_primitives.h
浏览文件 @
9c5d5665
...
@@ -17,64 +17,49 @@
...
@@ -17,64 +17,49 @@
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kernel_primitives
{
namespace
kernel_primitives
{
namespace
details
{
static
__device__
__forceinline__
platform
::
float16
ExpFunctor
(
#ifdef PADDLE_WITH_XPU2
platform
::
float16
x
)
{
struct
dim3
{
return
::
Eigen
::
numext
::
exp
(
x
);
int
x
;
}
int
y
;
static
__device__
__forceinline__
float
ExpFunctor
(
float
x
)
{
return
expf
(
x
);
}
int
z
;
static
__device__
__forceinline__
double
ExpFunctor
(
double
x
)
{
return
exp
(
x
);
}
static
__device__
__forceinline__
platform
::
float16
LogFunctor
(
platform
::
float16
x
)
{
return
::
Eigen
::
numext
::
log
(
x
);
}
static
__device__
__forceinline__
float
LogFunctor
(
float
x
)
{
return
logf
(
x
);
}
static
__device__
__forceinline__
double
LogFunctor
(
double
x
)
{
return
log
(
x
);
}
/*************************** Compute Functor****************************/
explicit
inline
dim3
(
int
split_x
,
int
split_y
=
1
,
int
split_z
=
1
)
{
// for margin_cross_entropy
x
=
split_x
;
template
<
typename
Tx
,
typename
Ty
=
Tx
>
y
=
split_y
;
struct
ExpLogitTransformer
{
z
=
split_z
;
HOSTDEVICE
explicit
inline
ExpLogitTransformer
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
*
x
)
const
{
return
static_cast
<
Ty
>
(
details
::
ExpFunctor
(
x
[
0
]));
}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
&
x
)
const
{
return
static_cast
<
Ty
>
(
details
::
ExpFunctor
(
x
));
}
}
};
};
#endif
// Post processing function for sum, max, min, prod, any
struct
DimConfig
{
template
<
typename
Tx
,
typename
Ty
=
Tx
>
int
split_num_x
;
struct
IdentityFunctor
{
int
split_num_y
;
HOSTDEVICE
explicit
inline
IdentityFunctor
(
int
n
)
{}
int
split_num_z
;
int
deal_size_x
;
int
deal_size_y
;
int
deal_size_z
;
int
rem_x
;
int
rem_y
;
int
rem_z
;
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
*
x
)
const
{
HOSTDEVICE
explicit
inline
DimConfig
(
int
split_x
,
int
split_y
,
int
split_z
,
return
static_cast
<
Ty
>
(
x
[
0
]);
int
size_x
,
int
size_y
,
int
size_z
)
{
split_num_x
=
split_x
;
split_num_y
=
split_y
;
split_num_z
=
split_z
;
deal_size_x
=
size_x
;
deal_size_y
=
size_y
;
deal_size_z
=
size_z
;
}
}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
&
x
)
const
{
HOSTDEVICE
void
SetRem
(
int
rem_nx
,
int
rem_ny
,
int
rem_nz
)
{
return
static_cast
<
Ty
>
(
x
);
rem_x
=
rem_nx
;
rem_y
=
rem_ny
;
rem_z
=
rem_nz
;
}
}
};
};
// Post processing function for mean
template
<
typename
T
>
struct
DivideFunctor
{
HOSTDEVICE
explicit
inline
DivideFunctor
(
int
n
)
:
n_inv
((
T
)(
1.0
/
n
))
{}
HOSTDEVICE
inline
T
operator
()(
const
T
*
x
)
const
{
return
x
[
0
]
*
n_inv
;
}
HOSTDEVICE
inline
T
operator
()(
const
T
&
x
)
const
{
return
x
*
n_inv
;
}
private:
T
n_inv
;
};
}
// namespace details
}
// namespace kernel_primitives
}
// namespace kernel_primitives
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
浏览文件 @
9c5d5665
...
@@ -13,11 +13,45 @@
...
@@ -13,11 +13,45 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#define THREAD_ID_X threadIdx.x
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
paddle/fluid/operators/margin_cross_entropy_op.cu
浏览文件 @
9c5d5665
...
@@ -130,7 +130,7 @@ __global__ void AddMarginToPositiveLogitsKernel(
...
@@ -130,7 +130,7 @@ __global__ void AddMarginToPositiveLogitsKernel(
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
ExpAndSum
{
struct
ExpAndSum
{
using
Transformer
=
kp
ds
::
ExpLogitTransforme
r
<
Tx
>
;
using
Transformer
=
kp
s
::
ExpFuncto
r
<
Tx
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
...
@@ -159,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
...
@@ -159,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
const
int64_t
N
,
const
int64_t
D
)
{
const
int64_t
N
,
const
int64_t
D
)
{
CUDA_KERNEL_LOOP
(
i
,
N
*
D
)
{
CUDA_KERNEL_LOOP
(
i
,
N
*
D
)
{
auto
row
=
i
/
D
;
auto
row
=
i
/
D
;
logits
[
i
]
-=
kp
ds
::
LogFunctor
(
logits_sum_per_row
[
row
]);
logits
[
i
]
-=
kp
s
::
details
::
Log
(
logits_sum_per_row
[
row
]);
}
}
}
}
...
@@ -174,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
...
@@ -174,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
if
((
col
+
start_index
)
==
labels
[
row
])
{
if
((
col
+
start_index
)
==
labels
[
row
])
{
auto
softmax
=
log_softmax
[
i
];
auto
softmax
=
log_softmax
[
i
];
loss
[
row
]
=
-
softmax
;
loss
[
row
]
=
-
softmax
;
log_softmax
[
i
]
=
kp
ds
::
ExpFunctor
(
softmax
);
log_softmax
[
i
]
=
kp
s
::
details
::
Exp
(
softmax
);
}
else
{
}
else
{
log_softmax
[
i
]
=
kp
ds
::
ExpFunctor
(
log_softmax
[
i
]);
log_softmax
[
i
]
=
kp
s
::
details
::
Exp
(
log_softmax
[
i
]);
}
}
}
}
}
}
...
...
paddle/fluid/operators/reduce_ops/reduce_functor_op.h
浏览文件 @
9c5d5665
...
@@ -24,11 +24,11 @@ limitations under the License. */
...
@@ -24,11 +24,11 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kp
ds
=
paddle
::
operators
::
kernel_primitives
::
detail
s
;
namespace
kp
s
=
paddle
::
operators
::
kernel_primitive
s
;
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomMin
{
struct
CustomMin
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
>
;
inline
Ty
initial
()
{
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
std
::
numeric_limits
<
Ty
>::
max
());
return
static_cast
<
Ty
>
(
std
::
numeric_limits
<
Ty
>::
max
());
...
@@ -41,7 +41,7 @@ struct CustomMin {
...
@@ -41,7 +41,7 @@ struct CustomMin {
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomMax
{
struct
CustomMax
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
>
;
inline
Ty
initial
()
{
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
std
::
numeric_limits
<
Ty
>::
lowest
());
return
static_cast
<
Ty
>
(
std
::
numeric_limits
<
Ty
>::
lowest
());
...
@@ -55,7 +55,7 @@ struct CustomMax {
...
@@ -55,7 +55,7 @@ struct CustomMax {
// for cub::Reduce
// for cub::Reduce
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomSum
{
struct
CustomSum
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
,
Ty
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
,
Ty
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
...
@@ -66,7 +66,7 @@ struct CustomSum {
...
@@ -66,7 +66,7 @@ struct CustomSum {
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomMean
{
struct
CustomMean
{
using
Transformer
=
kp
d
s
::
DivideFunctor
<
Tx
>
;
using
Transformer
=
kps
::
DivideFunctor
<
Tx
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
0.0
f
);
}
...
@@ -77,7 +77,7 @@ struct CustomMean {
...
@@ -77,7 +77,7 @@ struct CustomMean {
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomMul
{
struct
CustomMul
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
1.0
f
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
1.0
f
);
}
...
@@ -88,7 +88,7 @@ struct CustomMul {
...
@@ -88,7 +88,7 @@ struct CustomMul {
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomLogicalOr
{
struct
CustomLogicalOr
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
false
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
false
);
}
...
@@ -99,7 +99,7 @@ struct CustomLogicalOr {
...
@@ -99,7 +99,7 @@ struct CustomLogicalOr {
template
<
typename
Tx
,
typename
Ty
=
Tx
>
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
CustomLogicalAnd
{
struct
CustomLogicalAnd
{
using
Transformer
=
kp
d
s
::
IdentityFunctor
<
Tx
>
;
using
Transformer
=
kps
::
IdentityFunctor
<
Tx
>
;
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
true
);
}
inline
Ty
initial
()
{
return
static_cast
<
Ty
>
(
true
);
}
...
...
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
浏览文件 @
9c5d5665
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录