Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
ccc7c358
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ccc7c358
编写于
3月 12, 2019
作者:
T
Tao Luo
提交者:
GitHub
3月 12, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #16104 from tensor-tang/refine/jit
refine jitkernels and test
上级
c49b7855
14d87112
变更
45
展开全部
隐藏空白更改
内联
并排
Showing
45 changed file
with
1616 addition
and
1273 deletion
+1616
-1273
paddle/fluid/operators/crf_decoding_op.h
paddle/fluid/operators/crf_decoding_op.h
+3
-2
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+3
-2
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+6
-4
paddle/fluid/operators/fused/fusion_gru_op.cc
paddle/fluid/operators/fused/fusion_gru_op.cc
+26
-23
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+28
-26
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+4
-2
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+1
-1
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+12
-8
paddle/fluid/operators/jit/CMakeLists.txt
paddle/fluid/operators/jit/CMakeLists.txt
+1
-1
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/jit/benchmark.cc
+141
-148
paddle/fluid/operators/jit/gen/act.cc
paddle/fluid/operators/jit/gen/act.cc
+8
-7
paddle/fluid/operators/jit/gen/blas.cc
paddle/fluid/operators/jit/gen/blas.cc
+3
-2
paddle/fluid/operators/jit/gen/embseqpool.cc
paddle/fluid/operators/jit/gen/embseqpool.cc
+2
-1
paddle/fluid/operators/jit/gen/gru.cc
paddle/fluid/operators/jit/gen/gru.cc
+2
-1
paddle/fluid/operators/jit/gen/hopv.cc
paddle/fluid/operators/jit/gen/hopv.cc
+2
-1
paddle/fluid/operators/jit/gen/jitcode.h
paddle/fluid/operators/jit/gen/jitcode.h
+1
-1
paddle/fluid/operators/jit/gen/lstm.cc
paddle/fluid/operators/jit/gen/lstm.cc
+2
-1
paddle/fluid/operators/jit/gen/matmul.cc
paddle/fluid/operators/jit/gen/matmul.cc
+2
-2
paddle/fluid/operators/jit/gen/seqpool.cc
paddle/fluid/operators/jit/gen/seqpool.cc
+2
-1
paddle/fluid/operators/jit/gen/sgd.cc
paddle/fluid/operators/jit/gen/sgd.cc
+2
-1
paddle/fluid/operators/jit/gen/vbroadcast.cc
paddle/fluid/operators/jit/gen/vbroadcast.cc
+1
-1
paddle/fluid/operators/jit/gen_base.cc
paddle/fluid/operators/jit/gen_base.cc
+1
-1
paddle/fluid/operators/jit/gen_base.h
paddle/fluid/operators/jit/gen_base.h
+4
-3
paddle/fluid/operators/jit/helper.h
paddle/fluid/operators/jit/helper.h
+116
-49
paddle/fluid/operators/jit/kernel_base.h
paddle/fluid/operators/jit/kernel_base.h
+73
-25
paddle/fluid/operators/jit/kernel_key.cc
paddle/fluid/operators/jit/kernel_key.cc
+17
-41
paddle/fluid/operators/jit/kernel_key.h
paddle/fluid/operators/jit/kernel_key.h
+1
-1
paddle/fluid/operators/jit/kernel_pool.h
paddle/fluid/operators/jit/kernel_pool.h
+4
-3
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+1
-1
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+3
-3
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+1
-1
paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+3
-2
paddle/fluid/operators/jit/more/mix/mix.cc
paddle/fluid/operators/jit/more/mix/mix.cc
+41
-43
paddle/fluid/operators/jit/more/mix/mix.h
paddle/fluid/operators/jit/more/mix/mix.h
+14
-14
paddle/fluid/operators/jit/more/mkl/mkl.cc
paddle/fluid/operators/jit/more/mkl/mkl.cc
+40
-39
paddle/fluid/operators/jit/more/mkl/mkl.h
paddle/fluid/operators/jit/more/mkl/mkl.h
+23
-26
paddle/fluid/operators/jit/refer/refer.cc
paddle/fluid/operators/jit/refer/refer.cc
+36
-44
paddle/fluid/operators/jit/refer/refer.h
paddle/fluid/operators/jit/refer/refer.h
+37
-43
paddle/fluid/operators/jit/registry.h
paddle/fluid/operators/jit/registry.h
+3
-2
paddle/fluid/operators/jit/test.cc
paddle/fluid/operators/jit/test.cc
+932
-682
paddle/fluid/operators/layer_norm_op.h
paddle/fluid/operators/layer_norm_op.h
+2
-2
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+5
-6
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/sequence_pooling.cc
+2
-2
paddle/fluid/operators/math/softmax_impl.h
paddle/fluid/operators/math/softmax_impl.h
+1
-2
paddle/fluid/operators/optimizers/sgd_op.h
paddle/fluid/operators/optimizers/sgd_op.h
+4
-2
未找到文件。
paddle/fluid/operators/crf_decoding_op.h
浏览文件 @
ccc7c358
...
@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
...
@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor
track
;
Tensor
track
;
int
*
track_value
=
int
*
track_value
=
track
.
mutable_data
<
int
>
(
emission_dims
,
platform
::
CPUPlace
());
track
.
mutable_data
<
int
>
(
emission_dims
,
platform
::
CPUPlace
());
auto
ker
=
jit
::
Get
<
jit
::
kCRFDecoding
,
jit
::
CRFDecodingTuples
<
T
>
,
auto
ker
=
platform
::
CPUPlace
>
(
tag_num
);
jit
::
KernelFuncs
<
jit
::
CRFDecodingTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
()
.
At
(
tag_num
);
ker
(
static_cast
<
int
>
(
seq_len
),
x
,
w
,
alpha_value
,
track_value
,
tag_num
);
ker
(
static_cast
<
int
>
(
seq_len
),
x
,
w
,
alpha_value
,
track_value
,
tag_num
);
T
max_score
=
-
std
::
numeric_limits
<
T
>::
max
();
T
max_score
=
-
std
::
numeric_limits
<
T
>::
max
();
int
max_i
=
0
;
int
max_i
=
0
;
...
...
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
浏览文件 @
ccc7c358
...
@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
...
@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
constexpr
int
simd_width
=
16
;
constexpr
int
simd_width
=
16
;
int
C
=
c
/
simd_width
;
int
C
=
c
/
simd_width
;
auto
multiply
=
jit
::
Get
<
jit
::
kNCHW16CMulNC
,
jit
::
NCHW16CMulNCTuples
<
T
>
,
auto
multiply
=
jit
::
KernelFuncs
<
jit
::
NCHW16CMulNCTuple
<
T
>
,
platform
::
CPUPlace
>
(
0
);
platform
::
CPUPlace
>::
Cache
()
.
At
(
0
);
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
for
(
int
ni
=
0
;
ni
<
n
;
ni
++
)
{
for
(
int
ni
=
0
;
ni
<
n
;
ni
++
)
{
for
(
int
ci
=
0
;
ci
<
C
;
ci
++
)
{
for
(
int
ci
=
0
;
ci
<
C
;
ci
++
)
{
...
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
浏览文件 @
ccc7c358
...
@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
...
@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
out_width
,
jit
::
SeqPoolType
::
kSum
);
out_width
,
jit
::
SeqPoolType
::
kSum
);
for
(
size_t
i
=
0
;
i
!=
ids_lod
.
size
()
-
1
;
++
i
)
{
for
(
size_t
i
=
0
;
i
!=
ids_lod
.
size
()
-
1
;
++
i
)
{
attr
.
index_height
=
ids_lod
[
i
+
1
]
-
ids_lod
[
i
];
attr
.
index_height
=
ids_lod
[
i
+
1
]
-
ids_lod
[
i
];
auto
emb_seqpool
=
jit
::
Get
<
jit
::
kEmbSeqPool
,
jit
::
EmbSeqPoolTuples
<
T
>
,
auto
emb_seqpool
=
platform
::
CPUPlace
>
(
attr
);
jit
::
KernelFuncs
<
jit
::
EmbSeqPoolTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
()
.
At
(
attr
);
emb_seqpool
(
table
,
ids
+
ids_lod
[
i
]
*
idx_width
,
output
+
i
*
out_width
,
emb_seqpool
(
table
,
ids
+
ids_lod
[
i
]
*
idx_width
,
output
+
i
*
out_width
,
&
attr
);
&
attr
);
}
}
...
@@ -135,8 +136,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
...
@@ -135,8 +136,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T
*
d_table_data
=
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
d_table_data
=
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
d_output_data
=
d_output
->
data
<
T
>
();
const
T
*
d_output_data
=
d_output
->
data
<
T
>
();
auto
vbroadcast
=
jit
::
Get
<
jit
::
kVBroadcast
,
jit
::
VBroadcastTuples
<
T
>
,
auto
vbroadcast
=
platform
::
CPUPlace
>
(
out_width
);
jit
::
KernelFuncs
<
jit
::
VBroadcastTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
()
.
At
(
out_width
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
int64_t
h
=
static_cast
<
int64_t
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
int64_t
h
=
static_cast
<
int64_t
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
const
T
*
src
=
d_output_data
+
i
*
out_width
;
const
T
*
src
=
d_output_data
+
i
*
out_width
;
...
...
paddle/fluid/operators/fused/fusion_gru_op.cc
浏览文件 @
ccc7c358
...
@@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const int total_T = x_dims[0]; \
const int total_T = x_dims[0]; \
const int D3 = wh_dims[1]
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
const jit::gru_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \
jit::gru_t one_step; \
auto ComputeH1 = \
auto ComputeH1 = \
jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At( \
auto ComputeHtPart1 = \
attr); \
jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
auto ComputeHtPart1 = \
auto ComputeHtPart2 = \
jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
.At(attr); \
const T* x_data = x->data<T>(); \
auto ComputeHtPart2 = \
const T* wx_data = wx->data<T>(); \
jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
const T* wh_data = wh->data<T>(); \
.At(attr); \
auto place = ctx.GetPlace(); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
T* xx_data = xx->mutable_data<T>(place)
T* xx_data = xx->mutable_data<T>(place)
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
...
...
paddle/fluid/operators/fused/fusion_lstm_op.cc
浏览文件 @
ccc7c358
...
@@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D = wh_dims[0]; \
const int D = wh_dims[0]; \
const int D4 = wh_dims[1]
const int D4 = wh_dims[1]
#define INIT_OTHER_DEFINES \
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/
\
/* diagonal weight*/
\
const T* wp_data = bias->data<T>() + D4; \
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/
\
/* for peephole only*/
\
T* checked_cell_data = nullptr; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
} \
const jit::lstm_attr_t attr( \
const jit::lstm_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
use_peepholes); \
use_peepholes); \
jit::lstm_t one_step; \
jit::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \
auto ComputeC1H1 = \
jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
auto ComputeCtHt = \
attr); \
jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
auto ComputeCtHt = \
jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
attr)
// Wh GEMM
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
#define GEMM_WH_ADDON(bs, prev, out) \
...
...
paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
浏览文件 @
ccc7c358
...
@@ -82,9 +82,11 @@ template <typename T>
...
@@ -82,9 +82,11 @@ template <typename T>
static
void
fc_relu
(
const
T
*
x
,
const
T
*
w
,
const
T
*
b
,
T
*
y
,
static
void
fc_relu
(
const
T
*
x
,
const
T
*
w
,
const
T
*
b
,
T
*
y
,
const
jit
::
matmul_attr_t
&
attr
)
{
const
jit
::
matmul_attr_t
&
attr
)
{
auto
matmul
=
auto
matmul
=
jit
::
Get
<
jit
::
kMatMul
,
jit
::
MatMulTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
);
jit
::
KernelFuncs
<
jit
::
MatMulTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
);
auto
addbias_relu
=
auto
addbias_relu
=
jit
::
Get
<
jit
::
kVAddRelu
,
jit
::
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
.
n
);
jit
::
KernelFuncs
<
jit
::
VAddReluTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
.
n
);
matmul
(
x
,
w
,
y
,
&
attr
);
matmul
(
x
,
w
,
y
,
&
attr
);
T
*
dst
=
y
;
T
*
dst
=
y
;
for
(
int
i
=
0
;
i
<
attr
.
m
;
++
i
)
{
for
(
int
i
=
0
;
i
<
attr
.
m
;
++
i
)
{
...
...
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
浏览文件 @
ccc7c358
...
@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
...
@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
attr
.
type
=
jit
::
SeqPoolType
::
kSqrt
;
attr
.
type
=
jit
::
SeqPoolType
::
kSqrt
;
}
}
auto
seqpool
=
auto
seqpool
=
jit
::
Get
<
jit
::
kSeqPool
,
jit
::
SeqPoolTuples
<
T
>
,
platform
::
CPUPlace
>
(
jit
::
KernelFuncs
<
jit
::
SeqPoolTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
);
attr
);
size_t
n
=
ins
.
size
();
size_t
n
=
ins
.
size
();
size_t
dst_step_size
=
n
*
w
;
size_t
dst_step_size
=
n
*
w
;
...
...
paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
浏览文件 @
ccc7c358
...
@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
...
@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
int
o_numel
=
attr
.
m
*
attr
.
n
;
int
o_numel
=
attr
.
m
*
attr
.
n
;
auto
vsquare_x
=
auto
vsquare_x
=
jit
::
Get
<
jit
::
kVSquare
,
jit
::
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
.
m
*
jit
::
KernelFuncs
<
jit
::
VSquareTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
.
k
);
attr
.
m
*
attr
.
k
);
auto
vsquare_y
=
auto
vsquare_y
=
jit
::
Get
<
jit
::
kVSquare
,
jit
::
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
.
k
*
jit
::
KernelFuncs
<
jit
::
VSquareTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
.
n
);
attr
.
k
*
attr
.
n
);
auto
vsquare_xy
=
auto
vsquare_xy
=
jit
::
Get
<
jit
::
kVSquare
,
jit
::
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
o_numel
);
jit
::
KernelFuncs
<
jit
::
VSquareTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
o_numel
);
auto
vsub
=
auto
vsub
=
jit
::
Get
<
jit
::
kVSub
,
jit
::
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
o_numel
);
jit
::
KernelFuncs
<
jit
::
VSubTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
o_numel
);
auto
vscal
=
auto
vscal
=
jit
::
Get
<
jit
::
kVScal
,
jit
::
AXYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
o_numel
);
jit
::
KernelFuncs
<
jit
::
VScalTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
o_numel
);
auto
matmul
=
auto
matmul
=
jit
::
Get
<
jit
::
kMatMul
,
jit
::
MatMulTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
);
jit
::
KernelFuncs
<
jit
::
MatMulTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
y_data
=
y
->
data
<
T
>
();
const
T
*
y_data
=
y
->
data
<
T
>
();
...
...
paddle/fluid/operators/jit/CMakeLists.txt
浏览文件 @
ccc7c358
...
@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
...
@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
file
(
APPEND
${
jit_file
}
"
\#
include
\"
paddle/fluid/operators/jit/helper.h
\"\n
"
)
file
(
APPEND
${
jit_file
}
"
\#
include
\"
paddle/fluid/operators/jit/helper.h
\"\n
"
)
file
(
APPEND
${
jit_file
}
"
\#
include
\"
paddle/fluid/operators/jit/registry.h
\"\n\n
"
)
file
(
APPEND
${
jit_file
}
"
\#
include
\"
paddle/fluid/operators/jit/registry.h
\"\n\n
"
)
set
(
JIT_KERNEL_DEPS cpu_info cblas gflags enforce place
)
set
(
JIT_KERNEL_DEPS cpu_info cblas gflags enforce place
xxhash
)
file
(
GLOB jit_kernel_cc_srcs RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"*.cc"
)
file
(
GLOB jit_kernel_cc_srcs RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"*.cc"
)
list
(
REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc
)
list
(
REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc
)
...
...
paddle/fluid/operators/jit/benchmark.cc
浏览文件 @
ccc7c358
...
@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
...
@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \
InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \
void BenchJITKernel_##name##_##dtype##_##place##_::Run()
void BenchJITKernel_##name##_##dtype##_##place##_::Run()
#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
void
RUN_ALL_BENCHMARK
()
{
void
RUN_ALL_BENCHMARK
()
{
for
(
auto
p
:
g_all_benchmarks
)
{
for
(
auto
p
:
g_all_benchmarks
)
{
if
(
!
FLAGS_filter
.
empty
()
&&
FLAGS_filter
!=
p
->
Name
())
{
if
(
!
FLAGS_filter
.
empty
()
&&
FLAGS_filter
!=
p
->
Name
())
{
...
@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
...
@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
return
s
;
return
s
;
}
}
template
<
typename
KernelTuple
s
,
typename
...
Args
>
template
<
typename
KernelTuple
,
typename
...
Args
>
struct
BenchFunc
{
struct
BenchFunc
{
// return this function avg time
// return this function avg time
// TODO(TJ): clear cache every time
// TODO(TJ): clear cache every time
double
operator
()(
const
typename
KernelTuple
s
::
func_type
tgt
,
Args
...
args
)
{
double
operator
()(
const
typename
KernelTuple
::
func_type
tgt
,
Args
...
args
)
{
for
(
int
i
=
0
;
i
<
FLAGS_burning
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_burning
;
++
i
)
{
tgt
(
args
...);
tgt
(
args
...);
}
}
...
@@ -109,40 +107,17 @@ struct BenchFunc {
...
@@ -109,40 +107,17 @@ struct BenchFunc {
namespace
jit
=
paddle
::
operators
::
jit
;
namespace
jit
=
paddle
::
operators
::
jit
;
template
<
jit
::
KernelType
KT
,
typename
KernelTuples
,
typename
PlaceType
,
template
<
typename
KernelTuple
,
typename
PlaceType
,
typename
...
Args
>
typename
...
Args
>
void
BenchAllImpls
(
const
typename
KernelTuple
::
attr_type
&
attr
,
Args
...
args
)
{
void
BenchAllImpls
(
const
typename
KernelTuples
::
attr_type
&
attr
,
Args
...
args
)
{
BenchFunc
<
KernelTuple
,
Args
...
>
benchmark
;
BenchFunc
<
KernelTuples
,
Args
...
>
benchmark
;
std
::
vector
<
std
::
pair
<
std
::
string
,
double
>>
infos
;
std
::
vector
<
std
::
pair
<
std
::
string
,
double
>>
infos
;
// test refer
auto
funcs
=
jit
::
GetAllCandidateFuncsWithTypes
<
KernelTuple
,
PlaceType
>
(
attr
);
auto
refer
=
jit
::
GetRefer
<
KT
,
KernelTuples
>
();
for
(
auto
f
:
funcs
)
{
if
(
!
refer
)
{
infos
.
push_back
(
std
::
make_pair
(
f
.
first
,
benchmark
(
f
.
second
,
args
...)));
LOG
(
FATAL
)
<<
"Refer can not be empty!"
;
}
}
infos
.
push_back
(
std
::
make_pair
(
"Refer"
,
benchmark
(
refer
,
args
...)));
// test jitcode
auto
jitcode
=
jit
::
GetJitCode
<
KT
,
KernelTuples
,
PlaceType
>
(
attr
);
if
(
jitcode
)
{
infos
.
push_back
(
std
::
make_pair
(
"JitCode"
,
benchmark
(
jitcode
,
args
...)));
}
// test all impls in more
jit
::
KernelKey
kkey
(
KT
,
PlaceType
());
auto
&
pool
=
jit
::
KernelPool
().
Instance
().
AllKernels
();
auto
iter
=
pool
.
find
(
kkey
);
if
(
iter
!=
pool
.
end
())
{
auto
&
impls
=
iter
->
second
;
for
(
auto
&
impl
:
impls
)
{
auto
i
=
dynamic_cast
<
const
jit
::
KernelMore
<
KernelTuples
>*>
(
impl
.
get
());
if
(
i
&&
i
->
UseMe
(
attr
))
{
auto
more
=
i
->
GetFunc
();
infos
.
push_back
(
std
::
make_pair
(
i
->
ImplType
(),
benchmark
(
more
,
args
...)));
}
}
}
// Test result from Get function
// Test result from Get function
auto
tgt
=
jit
::
Get
<
KT
,
KernelTuples
,
PlaceType
>
(
attr
);
auto
tgt
=
jit
::
KernelFuncs
<
KernelTuple
,
PlaceType
>::
Cache
().
At
(
attr
);
if
(
!
tgt
)
{
if
(
!
tgt
)
{
LOG
(
FATAL
)
<<
"Target can not be empty!"
;
LOG
(
FATAL
)
<<
"Target can not be empty!"
;
}
}
...
@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
...
@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
// print
// print
std
::
ostringstream
loginfos
;
std
::
ostringstream
loginfos
;
loginfos
<<
"Kernel Type "
<<
jit
::
to_string
(
KT
)
<<
": "
<<
attr
<<
": "
;
loginfos
<<
"Kernel Type "
<<
jit
::
to_string
(
KernelTuple
::
kernel_type
)
<<
": "
<<
attr
<<
": "
;
for
(
auto
pair
:
infos
)
{
for
(
auto
pair
:
infos
)
{
loginfos
<<
pair
.
first
<<
" takes "
<<
pair
.
second
<<
" us; "
;
loginfos
<<
pair
.
first
<<
" takes "
<<
pair
.
second
<<
" us; "
;
}
}
...
@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
...
@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
using
Tensor
=
paddle
::
framework
::
Tensor
;
using
Tensor
=
paddle
::
framework
::
Tensor
;
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchXYZNKernel
()
{
void
BenchKernelXYZN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
Tensor
x
,
y
,
z
;
Tensor
x
,
y
,
z
;
x
.
Resize
({
d
});
x
.
Resize
({
d
});
...
@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
...
@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
T
*
z_data
=
z
.
mutable_data
<
T
>
(
PlaceType
());
T
*
z_data
=
z
.
mutable_data
<
T
>
(
PlaceType
());
RandomVec
<
T
>
(
d
,
x_data
);
RandomVec
<
T
>
(
d
,
x_data
);
RandomVec
<
T
>
(
d
,
y_data
);
RandomVec
<
T
>
(
d
,
y_data
);
BenchAllImpls
<
K
T
,
jit
::
XYZNTuples
<
T
>
,
PlaceType
>
(
d
,
x
.
data
<
T
>
()
,
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
z_data
,
y
.
data
<
T
>
(),
z_data
,
d
);
d
);
// test inplace
// test inplace
BenchAllImpls
<
KT
,
jit
::
XYZNTuples
<
T
>
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
z_data
,
BenchAllImpls
<
KernelTuple
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
z_data
,
z_data
,
d
);
z_data
,
d
);
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchAXYNKernel
()
{
void
BenchKernelAXYN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
const
T
a
=
static_cast
<
T
>
(
3
);
const
T
a
=
static_cast
<
T
>
(
3
);
Tensor
x
,
y
;
Tensor
x
,
y
;
...
@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
...
@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
T
*
x_data
=
x
.
mutable_data
<
T
>
(
PlaceType
());
T
*
x_data
=
x
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
RandomVec
<
T
>
(
d
,
x_data
);
RandomVec
<
T
>
(
d
,
x_data
);
BenchAllImpls
<
KT
,
jit
::
AXYNTuples
<
T
>
,
PlaceType
>
(
d
,
&
a
,
x
.
data
<
T
>
(),
y_data
,
BenchAllImpls
<
KernelTuple
,
PlaceType
>
(
d
,
&
a
,
x
.
data
<
T
>
(),
y_data
,
d
);
d
);
// test inplace
// test inplace
BenchAllImpls
<
KT
,
jit
::
AXYNTuples
<
T
>
,
PlaceType
>
(
d
,
&
a
,
x
.
data
<
T
>
(),
x_data
,
BenchAllImpls
<
KernelTuple
,
PlaceType
>
(
d
,
&
a
,
x
.
data
<
T
>
(),
x_data
,
d
);
d
);
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchXRNKernel
()
{
void
BenchKernelXRN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
Tensor
x
;
Tensor
x
;
RandomVec
<
T
>
(
d
,
x
.
mutable_data
<
T
>
({
d
},
PlaceType
()));
RandomVec
<
T
>
(
d
,
x
.
mutable_data
<
T
>
({
d
},
PlaceType
()));
T
res
;
T
res
;
BenchAllImpls
<
K
T
,
jit
::
XRNTuples
<
T
>
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
&
res
,
d
);
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
&
res
,
d
);
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchXYNKernel
()
{
void
BenchKernelXYN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
Tensor
x
,
y
;
Tensor
x
,
y
;
x
.
Resize
({
d
});
x
.
Resize
({
d
});
...
@@ -216,12 +193,13 @@ void BenchXYNKernel() {
...
@@ -216,12 +193,13 @@ void BenchXYNKernel() {
T
*
x_data
=
x
.
mutable_data
<
T
>
(
PlaceType
());
T
*
x_data
=
x
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
RandomVec
<
T
>
(
d
,
x_data
);
RandomVec
<
T
>
(
d
,
x_data
);
BenchAllImpls
<
K
T
,
jit
::
XYNTuples
<
T
>
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
y_data
,
d
);
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
d
,
x
.
data
<
T
>
(),
y_data
,
d
);
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchLSTMKernel
()
{
void
BenchKernelLSTM
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
bool
use_peephole
:
{
true
,
false
})
{
for
(
bool
use_peephole
:
{
true
,
false
})
{
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
const
jit
::
lstm_attr_t
attr
(
d
,
jit
::
kVSigmoid
,
jit
::
kVTanh
,
jit
::
kVTanh
,
const
jit
::
lstm_attr_t
attr
(
d
,
jit
::
kVSigmoid
,
jit
::
kVTanh
,
jit
::
kVTanh
,
...
@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
...
@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
step
.
wp
=
wp_data
;
step
.
wp
=
wp_data
;
step
.
checked
=
checked_data
;
step
.
checked
=
checked_data
;
}
}
BenchAllImpls
<
K
T
,
jit
::
LSTMTuples
<
T
>
,
PlaceType
>
(
attr
,
&
step
,
&
attr
);
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
attr
,
&
step
,
&
attr
);
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchGRUKernel
()
{
void
BenchKernelGRU
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
d
:
TestSizes
())
{
for
(
int
d
:
TestSizes
())
{
const
jit
::
gru_attr_t
attr
(
d
,
jit
::
kVSigmoid
,
jit
::
kVTanh
);
const
jit
::
gru_attr_t
attr
(
d
,
jit
::
kVSigmoid
,
jit
::
kVTanh
);
auto
place
=
PlaceType
();
auto
place
=
PlaceType
();
...
@@ -275,12 +254,13 @@ void BenchGRUKernel() {
...
@@ -275,12 +254,13 @@ void BenchGRUKernel() {
step
.
gates
=
x_data
;
step
.
gates
=
x_data
;
step
.
ht_1
=
ht_1_data
;
step
.
ht_1
=
ht_1_data
;
step
.
ht
=
ht_data
;
step
.
ht
=
ht_data
;
BenchAllImpls
<
K
T
,
jit
::
GRUTuples
<
T
>
,
PlaceType
>
(
attr
,
&
step
,
&
attr
);
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
attr
,
&
step
,
&
attr
);
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchSeqPoolKernel
()
{
void
BenchKernelSeqPool
()
{
using
T
=
typename
KernelTuple
::
data_type
;
std
::
vector
<
jit
::
SeqPoolType
>
pool_types
=
{
std
::
vector
<
jit
::
SeqPoolType
>
pool_types
=
{
jit
::
SeqPoolType
::
kSum
,
jit
::
SeqPoolType
::
kAvg
,
jit
::
SeqPoolType
::
kSqrt
};
jit
::
SeqPoolType
::
kSum
,
jit
::
SeqPoolType
::
kAvg
,
jit
::
SeqPoolType
::
kSqrt
};
for
(
auto
type
:
pool_types
)
{
for
(
auto
type
:
pool_types
)
{
...
@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
...
@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
RandomVec
<
T
>
(
h
*
w
,
x
.
mutable_data
<
T
>
(
PlaceType
()),
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
h
*
w
,
x
.
mutable_data
<
T
>
(
PlaceType
()),
-
2.
f
,
2.
f
);
const
T
*
x_data
=
x
.
data
<
T
>
();
const
T
*
x_data
=
x
.
data
<
T
>
();
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
BenchAllImpls
<
KT
,
jit
::
SeqPoolTuples
<
T
>
,
PlaceType
>
(
attr
,
x_data
,
BenchAllImpls
<
KernelTuple
,
PlaceType
>
(
attr
,
x_data
,
y_data
,
&
attr
);
y_data
,
&
attr
);
}
}
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchEmbSeqPoolKernel
()
{
void
BenchKernelEmbSeqPool
()
{
using
T
=
typename
KernelTuple
::
data_type
;
std
::
vector
<
jit
::
SeqPoolType
>
pool_types
=
{
jit
::
SeqPoolType
::
kSum
};
std
::
vector
<
jit
::
SeqPoolType
>
pool_types
=
{
jit
::
SeqPoolType
::
kSum
};
int64_t
tbl_h
=
1e4
;
int64_t
tbl_h
=
1e4
;
for
(
int
tbl_w
:
{
10
,
16
,
256
})
{
for
(
int
tbl_w
:
{
10
,
16
,
256
})
{
...
@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
...
@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
tbl_h
-
1
);
tbl_h
-
1
);
const
int64_t
*
idx_data
=
idx
.
data
<
int64_t
>
();
const
int64_t
*
idx_data
=
idx
.
data
<
int64_t
>
();
T
*
o_data
=
out
.
mutable_data
<
T
>
(
PlaceType
());
T
*
o_data
=
out
.
mutable_data
<
T
>
(
PlaceType
());
BenchAllImpls
<
K
T
,
jit
::
EmbSeqPoolTuples
<
T
>
,
PlaceType
>
(
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
attr
,
table_data
,
idx_data
,
attr
,
table_data
,
idx_data
,
o_data
,
&
attr
);
o_data
,
&
attr
);
}
}
}
}
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchSgdKernel
()
{
void
BenchKernelSgd
()
{
using
T
=
typename
KernelTuple
::
data_type
;
const
T
lr
=
0.1
;
const
T
lr
=
0.1
;
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
...
@@ -364,15 +345,16 @@ void BenchSgdKernel() {
...
@@ -364,15 +345,16 @@ void BenchSgdKernel() {
const
T
*
grad_data
=
grad
.
data
<
T
>
();
const
T
*
grad_data
=
grad
.
data
<
T
>
();
const
int64_t
*
rows_data
=
rows
.
data
();
const
int64_t
*
rows_data
=
rows
.
data
();
jit
::
sgd_attr_t
attr
(
param_h
,
grad_w
,
rows_size
,
grad_w
,
rows_size
);
jit
::
sgd_attr_t
attr
(
param_h
,
grad_w
,
rows_size
,
grad_w
,
rows_size
);
BenchAllImpls
<
K
T
,
jit
::
SgdTuples
<
T
>
,
PlaceType
>
(
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
attr
,
&
lr
,
param_data
,
grad_data
,
attr
,
&
lr
,
param_data
,
grad_data
,
rows_data
,
param_data
,
&
attr
);
rows_data
,
param_data
,
&
attr
);
}
}
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchMatMulKernel
()
{
void
BenchKernelMatMul
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
m
:
{
1
,
2
,
3
,
4
})
{
for
(
int
m
:
{
1
,
2
,
3
,
4
})
{
for
(
int
n
:
TestSizes
())
{
for
(
int
n
:
TestSizes
())
{
for
(
int
k
:
TestSizes
())
{
for
(
int
k
:
TestSizes
())
{
...
@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
...
@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
const
T
*
b_data
=
b
.
data
<
T
>
();
const
T
*
b_data
=
b
.
data
<
T
>
();
T
*
c_data
=
c
.
mutable_data
<
T
>
(
PlaceType
());
T
*
c_data
=
c
.
mutable_data
<
T
>
(
PlaceType
());
const
jit
::
matmul_attr_t
attr
{
m
,
n
,
k
};
const
jit
::
matmul_attr_t
attr
{
m
,
n
,
k
};
BenchAllImpls
<
K
T
,
jit
::
MatMulTuples
<
T
>
,
PlaceType
>
(
attr
,
a_data
,
b
_data
,
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
attr
,
a_data
,
b_data
,
c
_data
,
c_data
,
&
attr
);
&
attr
);
}
}
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchSoftmaxKernel
()
{
void
BenchKernelSoftmax
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int
bs
:
{
1
,
2
,
10
})
{
for
(
int
bs
:
{
1
,
2
,
10
})
{
for
(
int
n
:
TestSizes
())
{
for
(
int
n
:
TestSizes
())
{
Tensor
x
,
y
;
Tensor
x
,
y
;
...
@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
...
@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
RandomVec
<
T
>
(
bs
*
n
,
x
.
mutable_data
<
T
>
(
PlaceType
()),
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
bs
*
n
,
x
.
mutable_data
<
T
>
(
PlaceType
()),
-
2.
f
,
2.
f
);
const
T
*
x_data
=
x
.
data
<
T
>
();
const
T
*
x_data
=
x
.
data
<
T
>
();
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
BenchAllImpls
<
KT
,
jit
::
SoftmaxTuples
<
T
>
,
PlaceType
>
(
n
,
x_data
,
y_data
,
n
,
BenchAllImpls
<
KernelTuple
,
PlaceType
>
(
n
,
x_data
,
y_data
,
n
,
bs
);
bs
);
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchLayerNormKernel
()
{
void
BenchKernelLayerNorm
()
{
using
T
=
typename
KernelTuple
::
data_type
;
const
T
epsilon
=
9.99999975e-06
;
const
T
epsilon
=
9.99999975e-06
;
for
(
int
n
:
{
1
,
2
,
10
})
{
for
(
int
n
:
{
1
,
2
,
10
})
{
for
(
int
x_dim_0
:
{
1
,
9
,
17
,
50
})
{
for
(
int
x_dim_0
:
{
1
,
9
,
17
,
50
})
{
...
@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
...
@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
T
*
var_data
=
var
.
data
<
T
>
();
T
*
var_data
=
var
.
data
<
T
>
();
T
*
out_data
=
out
.
mutable_data
<
T
>
(
PlaceType
());
T
*
out_data
=
out
.
mutable_data
<
T
>
(
PlaceType
());
BenchAllImpls
<
K
T
,
jit
::
LayerNormTuples
<
T
>
,
PlaceType
>
(
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
right
,
x_data
,
out_data
,
right
,
x_data
,
out_data
,
mean_data
,
var_data
,
scale_data
,
bias
_data
,
mean_data
,
var_data
,
scale
_data
,
left
,
epsilon
,
right
);
bias_data
,
left
,
epsilon
,
right
);
}
}
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchCRFDecodingKernel
()
{
void
BenchKernelCRFDecoding
()
{
using
T
=
typename
KernelTuple
::
data_type
;
constexpr
int
state_trans_base_idx
=
2
;
constexpr
int
state_trans_base_idx
=
2
;
for
(
int
seq_len
:
{
1
,
11
,
17
,
50
})
{
for
(
int
seq_len
:
{
1
,
11
,
17
,
50
})
{
for
(
int
tag_num
:
TestSizes
())
{
for
(
int
tag_num
:
TestSizes
())
{
...
@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
...
@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
T
*
alpha_data
=
alpha
.
mutable_data
<
T
>
(
PlaceType
());
T
*
alpha_data
=
alpha
.
mutable_data
<
T
>
(
PlaceType
());
int
*
track_data
=
track
.
mutable_data
<
int
>
(
PlaceType
());
int
*
track_data
=
track
.
mutable_data
<
int
>
(
PlaceType
());
BenchAllImpls
<
K
T
,
jit
::
CRFDecodingTuples
<
T
>
,
PlaceType
>
(
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
tag_num
,
seq_len
,
x_data
,
w_data
,
tag_num
,
seq_len
,
x_data
,
w_data
,
alpha_data
,
track_data
,
tag_num
);
alpha_data
,
track_data
,
tag_num
);
}
}
}
}
}
}
template
<
jit
::
KernelType
KT
,
typename
T
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchVBroadcastKernel
()
{
void
BenchKernelVBroadcast
()
{
using
T
=
typename
KernelTuple
::
data_type
;
for
(
int64_t
w
:
{
1
,
16
,
64
,
100
,
256
})
{
for
(
int64_t
w
:
{
1
,
16
,
64
,
100
,
256
})
{
Tensor
x
;
Tensor
x
;
x
.
Resize
({
w
});
x
.
Resize
({
w
});
...
@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
...
@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
Tensor
y
;
Tensor
y
;
y
.
Resize
({
h
*
w
});
y
.
Resize
({
h
*
w
});
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
T
*
y_data
=
y
.
mutable_data
<
T
>
(
PlaceType
());
BenchAllImpls
<
K
T
,
jit
::
VBroadcastTuples
<
T
>
,
PlaceType
>
(
BenchAllImpls
<
K
ernelTuple
,
PlaceType
>
(
w
,
x_data
,
y_data
,
w
,
x_data
,
y_data
,
static_cast
<
int64_t
>
(
h
),
w
);
static_cast
<
int64_t
>
(
h
),
w
);
}
}
}
}
}
}
using
T
=
float
;
#define BenchKernelVMul BenchKernelXYZN
using
CPUPlace
=
paddle
::
platform
::
CPUPlace
;
#define BenchKernelVAdd BenchKernelXYZN
#define BenchKernelVAddRelu BenchKernelXYZN
#define BenchKernelVSub BenchKernelXYZN
// xyzn
#define BenchKernelVScal BenchKernelAXYN
BENCH_FP32_CPU
(
kVMul
)
{
BenchXYZNKernel
<
jit
::
kVMul
,
T
,
CPUPlace
>
();
}
#define BenchKernelVAddBias BenchKernelAXYN
BENCH_FP32_CPU
(
kVAdd
)
{
BenchXYZNKernel
<
jit
::
kVAdd
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVAddRelu
)
{
BenchXYZNKernel
<
jit
::
kVAddRelu
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVSub
)
{
BenchXYZNKernel
<
jit
::
kVSub
,
T
,
CPUPlace
>
();
}
// axyn
#define BenchKernelVRelu BenchKernelXYN
BENCH_FP32_CPU
(
kVScal
)
{
BenchAXYNKernel
<
jit
::
kVScal
,
T
,
CPUPlace
>
();
}
#define BenchKernelVIdentity BenchKernelXYN
BENCH_FP32_CPU
(
kVAddBias
)
{
BenchAXYNKernel
<
jit
::
kVAddBias
,
T
,
CPUPlace
>
();
}
#define BenchKernelVSquare BenchKernelXYN
#define BenchKernelVExp BenchKernelXYN
#define BenchKernelVSigmoid BenchKernelXYN
#define BenchKernelVTanh BenchKernelXYN
#define BenchKernelVCopy BenchKernelXYN
// xrn
#define BenchKernelHMax BenchKernelXRN
BENCH_FP32_CPU
(
kHSum
)
{
BenchXRNKernel
<
jit
::
kHSum
,
T
,
CPUPlace
>
();
}
#define BenchKernelHSum BenchKernelXRN
BENCH_FP32_CPU
(
kHMax
)
{
BenchXRNKernel
<
jit
::
kHMax
,
T
,
CPUPlace
>
();
}
// xyn
#define BenchKernelLSTMCtHt BenchKernelLSTM
BENCH_FP32_CPU
(
kVRelu
)
{
BenchXYNKernel
<
jit
::
kVRelu
,
T
,
CPUPlace
>
();
}
#define BenchKernelLSTMC1H1 BenchKernelLSTM
BENCH_FP32_CPU
(
kVIdentity
)
{
BenchXYNKernel
<
jit
::
kVIdentity
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVSquare
)
{
BenchXYNKernel
<
jit
::
kVSquare
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVExp
)
{
BenchXYNKernel
<
jit
::
kVExp
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVSigmoid
)
{
BenchXYNKernel
<
jit
::
kVSigmoid
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVTanh
)
{
BenchXYNKernel
<
jit
::
kVTanh
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kVCopy
)
{
BenchXYNKernel
<
jit
::
kVCopy
,
T
,
CPUPlace
>
();
}
// lstm and peephole
BENCH_FP32_CPU
(
kLSTMCtHt
)
{
BenchLSTMKernel
<
jit
::
kLSTMCtHt
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kLSTMC1H1
)
{
BenchLSTMKernel
<
jit
::
kLSTMC1H1
,
T
,
CPUPlace
>
();
}
// gru functions
BENCH_FP32_CPU
(
kGRUH1
)
{
BenchGRUKernel
<
jit
::
kGRUH1
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kGRUHtPart1
)
{
BenchGRUKernel
<
jit
::
kGRUHtPart1
,
T
,
CPUPlace
>
();
}
BENCH_FP32_CPU
(
kGRUHtPart2
)
{
BenchGRUKernel
<
jit
::
kGRUHtPart2
,
T
,
CPUPlace
>
();
}
// seq pool function
BENCH_FP32_CPU
(
kSeqPool
)
{
BenchSeqPoolKernel
<
jit
::
kSeqPool
,
T
,
CPUPlace
>
();
}
// embedding seq pool function
BENCH_FP32_CPU
(
kEmbSeqPool
)
{
BenchEmbSeqPoolKernel
<
jit
::
kEmbSeqPool
,
T
,
CPUPlace
>
();
}
// sgd function
#define BenchKernelGRUH1 BenchKernelGRU
BENCH_FP32_CPU
(
kSgd
)
{
BenchSgdKernel
<
jit
::
kSgd
,
T
,
CPUPlace
>
();
}
#define BenchKernelGRUHtPart1 BenchKernelGRU
#define BenchKernelGRUHtPart2 BenchKernelGRU
// matmul
using
CPUPlace
=
paddle
::
platform
::
CPUPlace
;
BENCH_FP32_CPU
(
kMatMul
)
{
BenchMatMulKernel
<
jit
::
kMatMul
,
T
,
CPUPlace
>
();
}
// softmax
#define BENCH_FP32_CPU(name) \
BENCH_FP32_CPU
(
kSoftmax
)
{
BenchSoftmaxKernel
<
jit
::
kSoftmax
,
T
,
CPUPlace
>
();
}
BENCH_JITKERNEL(name, FP32, CPU) { \
BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
}
// layernorm
// xyzn
BENCH_FP32_CPU
(
kLayerNorm
)
{
BENCH_FP32_CPU
(
VMul
);
BenchLayerNormKernel
<
jit
::
kLayerNorm
,
T
,
CPUPlace
>
();
BENCH_FP32_CPU
(
VAdd
);
}
BENCH_FP32_CPU
(
VAddRelu
);
BENCH_FP32_CPU
(
VSub
);
// crfdecoding
// axyn
BENCH_FP32_CPU
(
kCRFDecoding
)
{
BENCH_FP32_CPU
(
VScal
);
BenchCRFDecodingKernel
<
jit
::
kCRFDecoding
,
T
,
CPUPlace
>
();
BENCH_FP32_CPU
(
VAddBias
);
}
// vbroadcast function
// xyn
BENCH_FP32_CPU
(
kVBroadcast
)
{
BENCH_FP32_CPU
(
VRelu
);
BenchVBroadcastKernel
<
jit
::
kVBroadcast
,
T
,
CPUPlace
>
();
BENCH_FP32_CPU
(
VIdentity
);
}
BENCH_FP32_CPU
(
VSquare
);
BENCH_FP32_CPU
(
VExp
);
BENCH_FP32_CPU
(
VSigmoid
);
BENCH_FP32_CPU
(
VTanh
);
BENCH_FP32_CPU
(
VCopy
);
// xrn
BENCH_FP32_CPU
(
HMax
);
BENCH_FP32_CPU
(
HSum
);
// LSTM
BENCH_FP32_CPU
(
LSTMCtHt
);
BENCH_FP32_CPU
(
LSTMC1H1
);
// GRU
BENCH_FP32_CPU
(
GRUH1
);
BENCH_FP32_CPU
(
GRUHtPart1
);
BENCH_FP32_CPU
(
GRUHtPart2
);
BENCH_FP32_CPU
(
LayerNorm
);
BENCH_FP32_CPU
(
CRFDecoding
);
BENCH_FP32_CPU
(
SeqPool
);
BENCH_FP32_CPU
(
EmbSeqPool
);
BENCH_FP32_CPU
(
MatMul
);
BENCH_FP32_CPU
(
Softmax
);
BENCH_FP32_CPU
(
Sgd
);
BENCH_FP32_CPU
(
VBroadcast
);
// Benchmark all jit kernels including jitcode, mkl and refer.
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// To use this tool, run command: ./benchmark [options...]
...
...
paddle/fluid/operators/jit/gen/act.cc
浏览文件 @
ccc7c358
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/gen/act.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
...
@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
#define DECLARE_ACT_CREATOR(name) \
#define DECLARE_ACT_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
class name##Creator : public JitCodeCreator<int> { \
public: \
public: \
bool
UseMe(const int& attr) const override;
\
bool
CanBeUsed(const int& attr) const override;
\
size_t CodeSize(const int& d) const override; \
size_t CodeSize(const int& d) const override; \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
...
@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
...
@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR
(
VTanh
);
DECLARE_ACT_CREATOR
(
VTanh
);
// TODO(TJ): tuning use me
// TODO(TJ): tuning use me
bool
VReluCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VReluCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
bool
VSquareCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VSquareCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
bool
VIdentityCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VIdentityCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
bool
VExpCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VExpCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
<
32
;
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
<
32
;
}
}
bool
VSigmoidCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VSigmoidCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
bool
VTanhCreator
::
UseMe
(
const
int
&
d
)
const
{
bool
VTanhCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
...
...
paddle/fluid/operators/jit/gen/blas.cc
浏览文件 @
ccc7c358
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/blas.h"
#include "paddle/fluid/operators/jit/gen/blas.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
...
@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
class
NCHW16CMulNCCreator
:
public
JitCodeCreator
<
int
>
{
class
NCHW16CMulNCCreator
:
public
JitCodeCreator
<
int
>
{
public:
public:
bool
UseMe
(
const
int
&
attr
)
const
override
{
bool
CanBeUsed
(
const
int
&
attr
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx512f
);
return
platform
::
MayIUse
(
platform
::
avx512f
);
}
}
size_t
CodeSize
(
const
int
&
d
)
const
override
{
return
256
*
1024
;
}
size_t
CodeSize
(
const
int
&
d
)
const
override
{
return
256
*
1024
;
}
...
@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
...
@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
#define DECLARE_BLAS_CREATOR(name) \
#define DECLARE_BLAS_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
class name##Creator : public JitCodeCreator<int> { \
public: \
public: \
bool
UseMe(const int& attr) const override {
\
bool
CanBeUsed(const int& attr) const override {
\
return platform::MayIUse(platform::avx) && attr <= 1024; \
return platform::MayIUse(platform::avx) && attr <= 1024; \
} \
} \
size_t CodeSize(const int& d) const override { \
size_t CodeSize(const int& d) const override { \
...
...
paddle/fluid/operators/jit/gen/embseqpool.cc
浏览文件 @
ccc7c358
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include <vector>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
...
@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
...
@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
class
EmbSeqPoolCreator
:
public
JitCodeCreator
<
emb_seq_pool_attr_t
>
{
class
EmbSeqPoolCreator
:
public
JitCodeCreator
<
emb_seq_pool_attr_t
>
{
public:
public:
bool
UseMe
(
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
bool
CanBeUsed
(
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
return
platform
::
MayIUse
(
platform
::
avx
)
&&
attr
.
table_width
%
YMM_FLOAT_BLOCK
==
0
;
attr
.
table_width
%
YMM_FLOAT_BLOCK
==
0
;
}
}
...
...
paddle/fluid/operators/jit/gen/gru.cc
浏览文件 @
ccc7c358
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/gru.h"
#include "paddle/fluid/operators/jit/gen/gru.h"
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
...
@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
class name##Creator : public JitCodeCreator<gru_attr_t> { \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
public: \
/* TODO(TJ): enable more */
\
/* TODO(TJ): enable more */
\
bool
UseMe(const gru_attr_t& attr) const override {
\
bool
CanBeUsed(const gru_attr_t& attr) const override {
\
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
size_t CodeSize(const gru_attr_t& attr) const override { \
...
...
paddle/fluid/operators/jit/gen/hopv.cc
浏览文件 @
ccc7c358
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
...
@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
#define DECLARE_HOP_CREATOR(name) \
#define DECLARE_HOP_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
class name##Creator : public JitCodeCreator<int> { \
public: \
public: \
bool
UseMe(const int& attr) const override {
\
bool
CanBeUsed(const int& attr) const override {
\
return platform::MayIUse(platform::avx); \
return platform::MayIUse(platform::avx); \
} \
} \
size_t CodeSize(const int& d) const override { \
size_t CodeSize(const int& d) const override { \
...
...
paddle/fluid/operators/jit/gen/jitcode.h
浏览文件 @
ccc7c358
...
@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
...
@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
virtual
void
genCode
()
=
0
;
virtual
void
genCode
()
=
0
;
size_t
getSize
()
const
override
{
return
CodeGenerator
::
getSize
();
}
size_t
getSize
()
const
override
{
return
CodeGenerator
::
getSize
();
}
const
unsigned
char
*
getCodeInternal
()
override
{
const
unsigned
char
*
getCodeInternal
()
const
override
{
const
Xbyak
::
uint8
*
code
=
CodeGenerator
::
getCode
();
const
Xbyak
::
uint8
*
code
=
CodeGenerator
::
getCode
();
return
code
;
return
code
;
}
}
...
...
paddle/fluid/operators/jit/gen/lstm.cc
浏览文件 @
ccc7c358
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/lstm.h"
#include "paddle/fluid/operators/jit/gen/lstm.h"
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
...
@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
public: \
/* TODO(TJ): enable more */
\
/* TODO(TJ): enable more */
\
bool
UseMe(const lstm_attr_t& attr) const override {
\
bool
CanBeUsed(const lstm_attr_t& attr) const override {
\
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
size_t CodeSize(const lstm_attr_t& attr) const override { \
...
...
paddle/fluid/operators/jit/gen/matmul.cc
浏览文件 @
ccc7c358
...
@@ -14,8 +14,8 @@
...
@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/jit/gen/matmul.h"
#include "paddle/fluid/operators/jit/gen/matmul.h"
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
...
@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
class
MatMulCreator
:
public
JitCodeCreator
<
matmul_attr_t
>
{
class
MatMulCreator
:
public
JitCodeCreator
<
matmul_attr_t
>
{
public:
public:
bool
UseMe
(
const
matmul_attr_t
&
attr
)
const
override
{
bool
CanBeUsed
(
const
matmul_attr_t
&
attr
)
const
override
{
return
attr
.
m
==
1
&&
platform
::
MayIUse
(
platform
::
avx512f
)
&&
return
attr
.
m
==
1
&&
platform
::
MayIUse
(
platform
::
avx512f
)
&&
attr
.
n
%
ZMM_FLOAT_BLOCK
==
0
&&
attr
.
k
<
512
;
attr
.
n
%
ZMM_FLOAT_BLOCK
==
0
&&
attr
.
k
<
512
;
}
}
...
...
paddle/fluid/operators/jit/gen/seqpool.cc
浏览文件 @
ccc7c358
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include <memory>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
...
@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
class
SeqPoolCreator
:
public
JitCodeCreator
<
seq_pool_attr_t
>
{
class
SeqPoolCreator
:
public
JitCodeCreator
<
seq_pool_attr_t
>
{
public:
public:
bool
UseMe
(
const
seq_pool_attr_t
&
attr
)
const
override
{
bool
CanBeUsed
(
const
seq_pool_attr_t
&
attr
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
size_t
CodeSize
(
const
seq_pool_attr_t
&
attr
)
const
override
{
size_t
CodeSize
(
const
seq_pool_attr_t
&
attr
)
const
override
{
...
...
paddle/fluid/operators/jit/gen/sgd.cc
浏览文件 @
ccc7c358
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/sgd.h"
#include "paddle/fluid/operators/jit/gen/sgd.h"
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
...
@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
class
SgdCreator
:
public
JitCodeCreator
<
sgd_attr_t
>
{
class
SgdCreator
:
public
JitCodeCreator
<
sgd_attr_t
>
{
public:
public:
bool
UseMe
(
const
sgd_attr_t
&
attr
)
const
override
{
bool
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
return
platform
::
MayIUse
(
platform
::
avx
)
&&
attr
.
grad_width
%
YMM_FLOAT_BLOCK
==
0
;
attr
.
grad_width
%
YMM_FLOAT_BLOCK
==
0
;
}
}
...
...
paddle/fluid/operators/jit/gen/vbroadcast.cc
浏览文件 @
ccc7c358
...
@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
...
@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
class
VBroadcastCreator
:
public
JitCodeCreator
<
int64_t
>
{
class
VBroadcastCreator
:
public
JitCodeCreator
<
int64_t
>
{
public:
public:
bool
UseMe
(
const
int64_t
&
w
)
const
override
{
bool
CanBeUsed
(
const
int64_t
&
w
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
w
%
YMM_FLOAT_BLOCK
==
0
;
return
platform
::
MayIUse
(
platform
::
avx
)
&&
w
%
YMM_FLOAT_BLOCK
==
0
;
}
}
size_t
CodeSize
(
const
int64_t
&
w
)
const
override
{
size_t
CodeSize
(
const
int64_t
&
w
)
const
override
{
...
...
paddle/fluid/operators/jit/gen_base.cc
浏览文件 @
ccc7c358
...
@@ -31,7 +31,7 @@ namespace paddle {
...
@@ -31,7 +31,7 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
jit
{
namespace
jit
{
// refer do not need
useme
, it would be the last one.
// refer do not need
CanBeUsed
, it would be the last one.
void
GenBase
::
dumpCode
(
const
unsigned
char
*
code
)
const
{
void
GenBase
::
dumpCode
(
const
unsigned
char
*
code
)
const
{
if
(
code
)
{
if
(
code
)
{
static
int
counter
=
0
;
static
int
counter
=
0
;
...
...
paddle/fluid/operators/jit/gen_base.h
浏览文件 @
ccc7c358
...
@@ -31,9 +31,10 @@ class GenBase : public Kernel {
...
@@ -31,9 +31,10 @@ class GenBase : public Kernel {
virtual
~
GenBase
()
=
default
;
virtual
~
GenBase
()
=
default
;
virtual
std
::
string
name
()
const
=
0
;
virtual
std
::
string
name
()
const
=
0
;
virtual
size_t
getSize
()
const
=
0
;
virtual
size_t
getSize
()
const
=
0
;
virtual
const
unsigned
char
*
getCodeInternal
()
=
0
;
virtual
const
unsigned
char
*
getCodeInternal
()
const
=
0
;
const
char
*
ImplType
()
const
override
{
return
"JitCode"
;
}
template
<
typename
Func
>
template
<
typename
Func
>
Func
getCode
()
{
Func
getCode
()
const
{
const
unsigned
char
*
code
=
this
->
getCodeInternal
();
const
unsigned
char
*
code
=
this
->
getCodeInternal
();
if
(
FLAGS_dump_jitcode
)
{
if
(
FLAGS_dump_jitcode
)
{
this
->
dumpCode
(
code
);
this
->
dumpCode
(
code
);
...
@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
...
@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
virtual
~
JitCodeCreator
()
=
default
;
virtual
~
JitCodeCreator
()
=
default
;
// condition when this jit code can be used.
// condition when this jit code can be used.
virtual
bool
UseMe
(
const
Attr
&
attr
)
const
=
0
;
virtual
bool
CanBeUsed
(
const
Attr
&
attr
)
const
=
0
;
// estimate this code size
// estimate this code size
virtual
size_t
CodeSize
(
const
Attr
&
attr
)
const
=
0
;
virtual
size_t
CodeSize
(
const
Attr
&
attr
)
const
=
0
;
...
...
paddle/fluid/operators/jit/helper.h
浏览文件 @
ccc7c358
...
@@ -16,6 +16,8 @@
...
@@ -16,6 +16,8 @@
#include <iostream>
#include <iostream>
#include <string>
#include <string>
#include <unordered_map>
#include <utility> // for std::move
#include <vector>
#include <vector>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
...
@@ -27,35 +29,34 @@ namespace paddle {
...
@@ -27,35 +29,34 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
jit
{
namespace
jit
{
template
<
KernelType
KT
,
typename
KernelTuples
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
inline
typename
std
::
enable_if
<
inline
typename
std
::
enable_if
<
std
::
is_same
<
typename
KernelTuple
s
::
data_type
,
float
>::
value
&&
std
::
is_same
<
typename
KernelTuple
::
data_type
,
float
>::
value
&&
std
::
is_same
<
PlaceType
,
platform
::
CPUPlace
>::
value
,
std
::
is_same
<
PlaceType
,
platform
::
CPUPlace
>::
value
,
typename
KernelTuples
::
func_type
>::
type
const
Kernel
*>::
type
GetJitCode
(
const
typename
KernelTuples
::
attr_type
&
attr
)
{
GetJitCode
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
using
Func
=
typename
KernelTuples
::
func_type
;
using
Attr
=
typename
KernelTuple
::
attr_type
;
using
Attr
=
typename
KernelTuples
::
attr_type
;
int64_t
key
=
JitCodeKey
<
Attr
>
(
attr
);
size_t
key
=
JitCodeKey
<
Attr
>
(
attr
);
auto
&
codes
=
JitCodePool
<
KernelTuple
::
kernel_type
>::
Instance
();
auto
&
codes
=
JitCodePool
<
KT
>
().
Instance
();
if
(
codes
.
Has
(
key
))
{
if
(
codes
.
Has
(
key
))
{
return
codes
.
AllKernels
().
at
(
key
)
->
template
getCode
<
Func
>
();
return
codes
.
AllKernels
().
at
(
key
)
.
get
();
}
}
// creator is not related with attr, so can use KernelKey as key
// creator is not related with attr, so can use KernelKey as key
KernelKey
kkey
(
K
T
,
PlaceType
());
KernelKey
kkey
(
K
ernelTuple
::
kernel_type
,
PlaceType
());
// pool: (KernelKey(type, place), vector<GenCreatorPtr>)
// pool: (KernelKey(type, place), vector<GenCreatorPtr>)
auto
&
creator_map
=
JitCodeCreatorPool
().
Instance
().
AllCreators
();
auto
&
creator_map
=
JitCodeCreatorPool
::
Instance
().
AllCreators
();
auto
iter
=
creator_map
.
find
(
kkey
);
auto
iter
=
creator_map
.
find
(
kkey
);
if
(
iter
!=
creator_map
.
end
())
{
if
(
iter
!=
creator_map
.
end
())
{
auto
&
creators
=
iter
->
second
;
auto
&
creators
=
iter
->
second
;
for
(
auto
&
cur
:
creators
)
{
for
(
auto
&
cur
:
creators
)
{
auto
i
=
dynamic_cast
<
const
JitCodeCreator
<
Attr
>*>
(
cur
.
get
());
auto
i
=
dynamic_cast
<
const
JitCodeCreator
<
Attr
>*>
(
cur
.
get
());
if
(
i
&&
i
->
UseMe
(
attr
))
{
if
(
i
&&
i
->
CanBeUsed
(
attr
))
{
auto
p
=
i
->
CreateJitCode
(
attr
);
auto
p
=
i
->
CreateJitCode
(
attr
);
if
(
p
)
{
if
(
p
)
{
auto
f
=
p
->
template
getCode
<
Func
>
();
auto
res
=
p
.
get
();
codes
.
Insert
(
key
,
std
::
move
(
p
));
codes
.
Insert
(
key
,
std
::
move
(
p
));
return
f
;
return
res
;
}
}
}
}
}
}
...
@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
...
@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
return
nullptr
;
return
nullptr
;
}
}
template
<
KernelType
KT
,
typename
KernelTuples
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
inline
typename
std
::
enable_if
<
inline
typename
std
::
enable_if
<
!
std
::
is_same
<
typename
KernelTuple
s
::
data_type
,
float
>::
value
||
!
std
::
is_same
<
typename
KernelTuple
::
data_type
,
float
>::
value
||
!
std
::
is_same
<
PlaceType
,
platform
::
CPUPlace
>::
value
,
!
std
::
is_same
<
PlaceType
,
platform
::
CPUPlace
>::
value
,
typename
KernelTuples
::
func_type
>::
type
const
Kernel
*
>::
type
GetJitCode
(
const
typename
KernelTuple
s
::
attr_type
&
attr
)
{
GetJitCode
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
return
nullptr
;
return
nullptr
;
}
}
// Refer code do not related with attr, which is just for cast
// Refer code do not related with attr, which is just for cast
// Refer is always on CPUPlace
// Refer is always on CPUPlace
template
<
KernelType
KT
,
typename
KernelTuples
>
template
<
typename
KernelTuple
>
inline
typename
KernelTuples
::
func_type
GetRefer
()
{
inline
const
Kernel
*
GetReferKernel
()
{
auto
&
ref_pool
=
ReferKernelPool
().
Instance
().
AllKernels
();
auto
&
ref_pool
=
ReferKernelPool
::
Instance
().
AllKernels
();
KernelKey
kkey
(
K
T
,
platform
::
CPUPlace
());
KernelKey
kkey
(
K
ernelTuple
::
kernel_type
,
platform
::
CPUPlace
());
auto
ref_iter
=
ref_pool
.
find
(
kkey
);
auto
ref_iter
=
ref_pool
.
find
(
kkey
);
PADDLE_ENFORCE
(
ref_iter
!=
ref_pool
.
end
(),
PADDLE_ENFORCE
(
ref_iter
!=
ref_pool
.
end
(),
"Every Kernel should have reference function."
);
"Every Kernel should have reference function."
);
auto
&
ref_impls
=
ref_iter
->
second
;
auto
&
ref_impls
=
ref_iter
->
second
;
for
(
auto
&
impl
:
ref_impls
)
{
for
(
auto
&
impl
:
ref_impls
)
{
auto
i
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
s
>*>
(
impl
.
get
());
auto
i
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
impl
.
get
());
if
(
i
)
{
if
(
i
)
{
return
i
->
GetFunc
()
;
return
i
;
}
}
}
}
return
nullptr
;
return
nullptr
;
}
}
template
<
KernelType
KT
,
typename
KernelTuples
,
template
<
typename
KernelTuple
>
typename
PlaceType
=
platform
::
CPUPlace
>
inline
typename
KernelTuple
::
func_type
GetReferFunc
()
{
typename
KernelTuples
::
func_type
Get
(
auto
ker
=
GetReferKernel
<
KernelTuple
>
();
const
typename
KernelTuples
::
attr_type
&
attr
)
{
auto
p
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
ker
);
auto
jitfunc
=
GetJitCode
<
KT
,
KernelTuples
,
PlaceType
>
(
attr
);
PADDLE_ENFORCE
(
p
,
"The Refer kernel should exsit"
);
if
(
jitfunc
)
{
return
p
->
GetFunc
();
return
jitfunc
;
}
// Return all Kernels that can be used
template
<
typename
KernelTuple
,
typename
PlaceType
>
std
::
vector
<
const
Kernel
*>
GetAllCandidateKernels
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
// the search order shoudl be jitcode > more > refer
std
::
vector
<
const
Kernel
*>
res
;
auto
jitker
=
GetJitCode
<
KernelTuple
,
PlaceType
>
(
attr
);
if
(
jitker
)
{
res
.
emplace_back
(
jitker
);
}
}
// pool: (KernelKey(type, place), vector<KernelPtr>)
//
more kernel
pool: (KernelKey(type, place), vector<KernelPtr>)
KernelKey
kkey
(
K
T
,
PlaceType
());
KernelKey
kkey
(
K
ernelTuple
::
kernel_type
,
PlaceType
());
auto
&
pool
=
KernelPool
().
Instance
().
AllKernels
();
auto
&
pool
=
KernelPool
::
Instance
().
AllKernels
();
auto
iter
=
pool
.
find
(
kkey
);
auto
iter
=
pool
.
find
(
kkey
);
if
(
iter
!=
pool
.
end
())
{
if
(
iter
!=
pool
.
end
())
{
auto
&
impls
=
iter
->
second
;
auto
&
impls
=
iter
->
second
;
for
(
auto
&
impl
:
impls
)
{
for
(
auto
&
impl
:
impls
)
{
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
s
>*>
(
impl
.
get
());
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
>*>
(
impl
.
get
());
if
(
i
&&
i
->
UseMe
(
attr
))
{
if
(
i
&&
i
->
CanBeUsed
(
attr
))
{
re
turn
i
->
GetFunc
(
);
re
s
.
emplace_back
(
i
);
}
}
}
}
}
}
// The last implementation should be reference function on CPUPlace.
// The last implementation should be reference function on CPUPlace.
return
GetRefer
<
KT
,
KernelTuples
>
();
auto
ref
=
GetReferKernel
<
KernelTuple
>
();
PADDLE_ENFORCE
(
ref
!=
nullptr
,
"Refer Kernel can not be empty."
);
res
.
emplace_back
(
ref
);
return
res
;
}
template
<
typename
KernelTuple
,
typename
PlaceType
=
platform
::
CPUPlace
>
std
::
vector
<
std
::
pair
<
std
::
string
,
typename
KernelTuple
::
func_type
>>
GetAllCandidateFuncsWithTypes
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
using
Func
=
typename
KernelTuple
::
func_type
;
auto
kers
=
GetAllCandidateKernels
<
KernelTuple
,
PlaceType
>
(
attr
);
std
::
vector
<
std
::
pair
<
std
::
string
,
Func
>>
res
;
for
(
auto
k
:
kers
)
{
std
::
string
name
=
k
->
ImplType
();
if
(
name
==
"JitCode"
)
{
auto
i
=
dynamic_cast
<
const
GenBase
*>
(
k
);
PADDLE_ENFORCE
(
i
,
"jitcode kernel cast can not fail."
);
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
template
getCode
<
Func
>()));
}
else
{
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
>*>
(
k
);
PADDLE_ENFORCE
(
i
,
"kernel cast can not fail."
);
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
GetFunc
()));
}
}
return
res
;
}
template
<
typename
KernelTuple
,
typename
PlaceType
=
platform
::
CPUPlace
>
std
::
vector
<
typename
KernelTuple
::
func_type
>
GetAllCandidateFuncs
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
auto
funcs
=
GetAllCandidateFuncsWithTypes
<
KernelTuple
,
PlaceType
>
(
attr
);
std
::
vector
<
typename
KernelTuple
::
func_type
>
res
;
for
(
auto
&
i
:
funcs
)
{
res
.
emplace_back
(
i
.
second
);
}
return
res
;
}
template
<
typename
KernelTuple
,
typename
PlaceType
=
platform
::
CPUPlace
>
typename
KernelTuple
::
func_type
GetDefaultBestFunc
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
auto
funcs
=
GetAllCandidateFuncs
<
KernelTuple
,
PlaceType
>
(
attr
);
PADDLE_ENFORCE_GE
(
funcs
.
size
(),
1UL
);
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
return
funcs
[
0
];
}
}
template
<
KernelType
KT
,
typename
KernelTuples
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
class
KernelFuncs
{
class
KernelFuncs
{
public:
public:
KernelFuncs
()
=
default
;
KernelFuncs
()
=
default
;
static
KernelFuncs
&
Cache
()
{
static
KernelFuncs
&
Cache
()
{
static
thread_local
KernelFuncs
<
K
T
,
KernelTuples
,
PlaceType
>
g_func_cache
;
static
thread_local
KernelFuncs
<
K
ernelTuple
,
PlaceType
>
g_func_cache
;
return
g_func_cache
;
return
g_func_cache
;
}
}
bool
Has
(
int
key
)
const
{
return
funcs_
.
find
(
key
)
!=
funcs_
.
end
();
}
// the exposed interface to use
typename
KernelTuple
::
func_type
At
(
void
Insert
(
int
key
,
typename
KernelTuples
::
func_type
func
)
{
const
typename
KernelTuple
::
attr_type
&
attr
)
{
funcs_
.
emplace
(
key
,
func
);
// Maybe here is not good enough, not all kernels should have jitcode
}
int64_t
key
=
JitCodeKey
<
typename
KernelTuple
::
attr_type
>
(
attr
);
typename
KernelTuples
::
func_type
At
(
int
key
)
{
if
(
Has
(
key
))
{
if
(
Has
(
key
))
{
return
funcs_
.
at
(
key
);
return
funcs_
.
at
(
key
);
}
}
auto
func
=
Get
<
KT
,
KernelTuples
,
PlaceType
>
(
key
);
// If do not have this attr in cache then get the default best
auto
func
=
GetDefaultBestFunc
<
KernelTuple
,
PlaceType
>
(
attr
);
Insert
(
key
,
func
);
Insert
(
key
,
func
);
return
func
;
return
func
;
}
}
typename
KernelTuple
::
func_type
operator
[](
const
typename
KernelTuple
::
attr_type
&
attr
)
{
return
At
(
attr
);
}
protected:
bool
Has
(
int64_t
key
)
const
{
return
funcs_
.
find
(
key
)
!=
funcs_
.
end
();
}
void
Insert
(
int64_t
key
,
typename
KernelTuple
::
func_type
func
)
{
funcs_
.
emplace
(
key
,
func
);
}
private:
private:
std
::
unordered_map
<
int
,
typename
KernelTuples
::
func_type
>
funcs_
;
std
::
unordered_map
<
int
64_t
,
typename
KernelTuple
::
func_type
>
funcs_
;
DISABLE_COPY_AND_ASSIGN
(
KernelFuncs
);
DISABLE_COPY_AND_ASSIGN
(
KernelFuncs
);
};
};
...
...
paddle/fluid/operators/jit/kernel_base.h
浏览文件 @
ccc7c358
...
@@ -62,26 +62,55 @@ typedef enum {
...
@@ -62,26 +62,55 @@ typedef enum {
kSqrt
,
kSqrt
,
}
SeqPoolType
;
}
SeqPoolType
;
// x, y, z, n
template
<
typename
T
>
template
<
typename
T
>
struct
XYZNTuple
s
{
struct
XYZNTuple
{
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
int
);
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
int
);
};
};
// a, x, y, n
template
<
typename
T
>
template
<
typename
T
>
struct
AXYNTuple
s
:
public
XYZNTuples
<
T
>
{};
struct
AXYNTuple
:
public
XYZNTuple
<
T
>
{};
// x, y, n
template
<
typename
T
>
template
<
typename
T
>
struct
XYNTuple
s
{
struct
XYNTuple
{
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int
);
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int
);
};
};
// x, return
and int
// x, return
ed value, n
template
<
typename
T
>
template
<
typename
T
>
struct
XRNTuples
:
public
XYNTuples
<
T
>
{};
struct
XRNTuple
:
public
XYNTuple
<
T
>
{};
#define DECLARE_KERNELTUPLE(kernel_tuple, type) \
template <typename T> \
struct type##Tuple : public kernel_tuple<T> { \
static constexpr KernelType kernel_type = k##type; \
}
// Tuple should be corresponding to the KernelType
DECLARE_KERNELTUPLE
(
XYZNTuple
,
VMul
);
DECLARE_KERNELTUPLE
(
XYZNTuple
,
VAdd
);
DECLARE_KERNELTUPLE
(
XYZNTuple
,
VAddRelu
);
DECLARE_KERNELTUPLE
(
XYZNTuple
,
VSub
);
DECLARE_KERNELTUPLE
(
AXYNTuple
,
VScal
);
DECLARE_KERNELTUPLE
(
AXYNTuple
,
VAddBias
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VRelu
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VIdentity
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VSquare
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VExp
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VSigmoid
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VTanh
);
DECLARE_KERNELTUPLE
(
XYNTuple
,
VCopy
);
DECLARE_KERNELTUPLE
(
XRNTuple
,
HMax
);
DECLARE_KERNELTUPLE
(
XRNTuple
,
HSum
);
typedef
struct
{
typedef
struct
{
void
*
gates
;
// gates: x_ch, x_ih, x_fh, x_oh
void
*
gates
;
// gates: x_ch, x_ih, x_fh, x_oh
...
@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
...
@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
typedef
struct
lstm_attr_s
lstm_attr_t
;
typedef
struct
lstm_attr_s
lstm_attr_t
;
template
<
typename
T
>
template
<
typename
T
>
struct
LSTMTuple
s
{
struct
LSTMTuple
{
typedef
T
data_type
;
typedef
T
data_type
;
typedef
lstm_attr_t
attr_type
;
typedef
lstm_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
lstm_t
*
,
const
lstm_attr_t
*
);
typedef
void
(
*
func_type
)(
lstm_t
*
,
const
lstm_attr_t
*
);
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
GRUTuple
s
{
struct
GRUTuple
{
typedef
T
data_type
;
typedef
T
data_type
;
typedef
gru_attr_t
attr_type
;
typedef
gru_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
gru_t
*
,
const
gru_attr_t
*
);
typedef
void
(
*
func_type
)(
gru_t
*
,
const
gru_attr_t
*
);
};
};
DECLARE_KERNELTUPLE
(
LSTMTuple
,
LSTMCtHt
);
DECLARE_KERNELTUPLE
(
LSTMTuple
,
LSTMC1H1
);
DECLARE_KERNELTUPLE
(
GRUTuple
,
GRUH1
);
DECLARE_KERNELTUPLE
(
GRUTuple
,
GRUHtPart1
);
DECLARE_KERNELTUPLE
(
GRUTuple
,
GRUHtPart2
);
#undef DECLARE_KERNELTUPLE
template
<
typename
T
>
template
<
typename
T
>
struct
VBroadcastTuples
{
struct
VBroadcastTuple
{
static
constexpr
KernelType
kernel_type
=
kVBroadcast
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int64_t
attr_type
;
typedef
int64_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int64_t
,
int64_t
);
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int64_t
,
int64_t
);
...
@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
...
@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
}
seq_pool_attr_t
;
}
seq_pool_attr_t
;
template
<
typename
T
>
template
<
typename
T
>
struct
SeqPoolTuples
{
struct
SeqPoolTuple
{
static
constexpr
KernelType
kernel_type
=
kSeqPool
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
seq_pool_attr_t
attr_type
;
typedef
seq_pool_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
const
seq_pool_attr_t
*
);
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
const
seq_pool_attr_t
*
);
...
@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
...
@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
}
emb_seq_pool_attr_t
;
}
emb_seq_pool_attr_t
;
template
<
typename
T
>
template
<
typename
T
>
struct
EmbSeqPoolTuples
{
struct
EmbSeqPoolTuple
{
static
constexpr
KernelType
kernel_type
=
kEmbSeqPool
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
emb_seq_pool_attr_t
attr_type
;
typedef
emb_seq_pool_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
const
int64_t
*
,
T
*
,
typedef
void
(
*
func_type
)(
const
T
*
,
const
int64_t
*
,
T
*
,
...
@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
...
@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
}
sgd_attr_t
;
}
sgd_attr_t
;
template
<
typename
T
>
template
<
typename
T
>
struct
SgdTuples
{
struct
SgdTuple
{
static
constexpr
KernelType
kernel_type
=
kSgd
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
sgd_attr_t
attr_type
;
typedef
sgd_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
const
T
*
,
const
int64_t
*
,
T
*
,
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
const
T
*
,
const
int64_t
*
,
T
*
,
...
@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
...
@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
}
matmul_attr_t
;
}
matmul_attr_t
;
template
<
typename
T
>
template
<
typename
T
>
struct
MatMulTuples
{
struct
MatMulTuple
{
static
constexpr
KernelType
kernel_type
=
kMatMul
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
matmul_attr_t
attr_type
;
typedef
matmul_attr_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
const
matmul_attr_t
*
);
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
const
matmul_attr_t
*
);
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
CRFDecodingTuples
{
struct
CRFDecodingTuple
{
static
constexpr
KernelType
kernel_type
=
kCRFDecoding
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
const
int
,
const
T
*
,
const
T
*
,
T
*
,
int
*
,
int
);
typedef
void
(
*
func_type
)(
const
int
,
const
T
*
,
const
T
*
,
T
*
,
int
*
,
int
);
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
LayerNormTuples
{
struct
LayerNormTuple
{
static
constexpr
KernelType
kernel_type
=
kLayerNorm
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
T
*
,
T
*
,
T
*
,
T
*
,
const
T
*
,
const
T
*
,
int
,
typedef
void
(
*
func_type
)(
T
*
,
T
*
,
T
*
,
T
*
,
const
T
*
,
const
T
*
,
int
,
...
@@ -236,7 +281,8 @@ struct LayerNormTuples {
...
@@ -236,7 +281,8 @@ struct LayerNormTuples {
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
SoftmaxTuples
{
struct
SoftmaxTuple
{
static
constexpr
KernelType
kernel_type
=
kSoftmax
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int
,
int
);
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int
,
int
);
...
@@ -244,7 +290,8 @@ struct SoftmaxTuples {
...
@@ -244,7 +290,8 @@ struct SoftmaxTuples {
// nChw16c = nChw16c .* NC
// nChw16c = nChw16c .* NC
template
<
typename
T
>
template
<
typename
T
>
struct
NCHW16CMulNCTuples
{
struct
NCHW16CMulNCTuple
{
static
constexpr
KernelType
kernel_type
=
kNCHW16CMulNC
;
typedef
T
data_type
;
typedef
T
data_type
;
typedef
int
attr_type
;
typedef
int
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
int
,
int
);
typedef
void
(
*
func_type
)(
const
T
*
,
const
T
*
,
T
*
,
int
,
int
);
...
@@ -255,28 +302,29 @@ class Kernel {
...
@@ -255,28 +302,29 @@ class Kernel {
public:
public:
Kernel
()
=
default
;
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
virtual
const
char
*
ImplType
()
const
=
0
;
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
};
};
template
<
typename
KernelTuple
s
>
template
<
typename
KernelTuple
>
class
KernelMore
:
public
Kernel
{
class
KernelMore
:
public
Kernel
{
public:
public:
using
T
=
typename
KernelTuple
s
::
data_type
;
using
T
=
typename
KernelTuple
::
data_type
;
using
Func
=
typename
KernelTuple
s
::
func_type
;
using
Func
=
typename
KernelTuple
::
func_type
;
using
Attr
=
typename
KernelTuple
s
::
attr_type
;
using
Attr
=
typename
KernelTuple
::
attr_type
;
virtual
Func
GetFunc
()
const
{
return
func
;
}
virtual
Func
GetFunc
()
const
{
return
func
;
}
virtual
bool
UseMe
(
const
Attr
&
attr
)
const
=
0
;
// specify this kernel can be used, means it should not fail if use it.
virtual
const
char
*
ImplType
(
)
const
=
0
;
virtual
bool
CanBeUsed
(
const
Attr
&
attr
)
const
=
0
;
protected:
protected:
Func
func
{
nullptr
};
Func
func
{
nullptr
};
};
};
template
<
typename
KernelTuple
s
>
template
<
typename
KernelTuple
>
class
ReferKernel
:
public
KernelMore
<
KernelTuple
s
>
{
class
ReferKernel
:
public
KernelMore
<
KernelTuple
>
{
public:
public:
// Refer code can always be used
// Refer code can always be used
bool
UseMe
(
const
typename
KernelTuples
::
attr_type
&
attr
)
const
override
{
bool
CanBeUsed
(
const
typename
KernelTuple
::
attr_type
&
attr
)
const
override
{
return
true
;
return
true
;
}
}
const
char
*
ImplType
()
const
override
{
return
"Refer"
;
}
const
char
*
ImplType
()
const
override
{
return
"Refer"
;
}
...
...
paddle/fluid/operators/jit/kernel_key.cc
浏览文件 @
ccc7c358
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/kernel_key.h"
#include "paddle/fluid/operators/jit/kernel_key.h"
#include <xxhash.h> // XXH64: 13.8 GB/s
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -20,71 +21,46 @@ namespace operators {
...
@@ -20,71 +21,46 @@ namespace operators {
namespace
jit
{
namespace
jit
{
template
<
>
template
<
>
size
_t
JitCodeKey
<
int
>
(
const
int
&
d
)
{
int64
_t
JitCodeKey
<
int
>
(
const
int
&
d
)
{
return
d
;
return
d
;
}
}
template
<
>
template
<
>
size
_t
JitCodeKey
<
int64_t
>
(
const
int64_t
&
d
)
{
int64
_t
JitCodeKey
<
int64_t
>
(
const
int64_t
&
d
)
{
return
d
;
return
d
;
}
}
// TODO(TJ): refine and benchmark JitCodeKey generatation
constexpr
int
act_type_shift
=
3
;
// suppot 2^3 act types
static
inline
int
act_type_convert
(
KernelType
type
)
{
if
(
type
==
kVIdentity
)
{
return
0
;
}
else
if
(
type
==
kVExp
)
{
return
1
;
}
else
if
(
type
==
kVRelu
)
{
return
2
;
}
else
if
(
type
==
kVSigmoid
)
{
return
3
;
}
else
if
(
type
==
kVTanh
)
{
return
4
;
}
PADDLE_THROW
(
"Unsupported act type %d"
,
type
);
return
0
;
}
template
<
>
template
<
>
size_t
JitCodeKey
<
lstm_attr_t
>
(
const
lstm_attr_t
&
attr
)
{
int64_t
JitCodeKey
<
gru_attr_t
>
(
const
gru_attr_t
&
attr
)
{
size_t
key
=
attr
.
d
;
return
XXH64
(
&
attr
,
sizeof
(
gru_attr_t
),
0
);
int
gate_key
=
act_type_convert
(
attr
.
act_gate
)
<<
1
;
int
cand_key
=
act_type_convert
(
attr
.
act_cand
)
<<
(
1
+
act_type_shift
);
int
cell_key
=
act_type_convert
(
attr
.
act_cell
)
<<
(
1
+
act_type_shift
*
2
);
return
(
key
<<
(
1
+
act_type_shift
*
3
))
+
gate_key
+
cand_key
+
cell_key
+
attr
.
use_peephole
;
}
}
template
<
>
template
<
>
size_t
JitCodeKey
<
gru_attr_t
>
(
const
gru_attr_t
&
attr
)
{
int64_t
JitCodeKey
<
lstm_attr_t
>
(
const
lstm_attr_t
&
attr
)
{
size_t
key
=
attr
.
d
;
int
keys
[
5
]
=
{
return
(
key
<<
(
act_type_shift
*
2
))
+
act_type_convert
(
attr
.
act_gate
)
+
attr
.
d
,
static_cast
<
int
>
(
attr
.
act_gate
),
static_cast
<
int
>
(
attr
.
act_cand
),
(
act_type_convert
(
attr
.
act_cand
)
<<
act_type_shift
);
static_cast
<
int
>
(
attr
.
act_cell
),
static_cast
<
int
>
(
attr
.
use_peephole
)};
return
XXH64
(
keys
,
sizeof
(
int
)
*
5
,
0
);
}
}
template
<
>
template
<
>
size_t
JitCodeKey
<
seq_pool_attr_t
>
(
const
seq_pool_attr_t
&
attr
)
{
int64_t
JitCodeKey
<
seq_pool_attr_t
>
(
const
seq_pool_attr_t
&
attr
)
{
size_t
key
=
attr
.
w
;
int
keys
[
2
]
=
{
attr
.
w
,
static_cast
<
int
>
(
attr
.
type
)};
constexpr
int
pool_type_shift
=
3
;
return
XXH64
(
keys
,
sizeof
(
int
)
*
2
,
0
);
return
(
key
<<
pool_type_shift
)
+
static_cast
<
int
>
(
attr
.
type
);
}
}
template
<
>
template
<
>
size_t
JitCodeKey
<
matmul_attr_t
>
(
const
matmul_attr_t
&
attr
)
{
int64_t
JitCodeKey
<
matmul_attr_t
>
(
const
matmul_attr_t
&
attr
)
{
size_t
key
=
attr
.
m
;
return
XXH64
(
&
attr
,
sizeof
(
int
)
*
3
,
0
);
// m, n, k
constexpr
int
shift
=
21
;
return
(
key
<<
shift
*
2
)
+
((
static_cast
<
size_t
>
(
attr
.
n
))
<<
shift
)
+
attr
.
k
;
}
}
template
<
>
template
<
>
size
_t
JitCodeKey
<
emb_seq_pool_attr_t
>
(
const
emb_seq_pool_attr_t
&
attr
)
{
int64
_t
JitCodeKey
<
emb_seq_pool_attr_t
>
(
const
emb_seq_pool_attr_t
&
attr
)
{
return
attr
.
table_width
;
return
attr
.
table_width
;
}
}
template
<
>
template
<
>
size
_t
JitCodeKey
<
sgd_attr_t
>
(
const
sgd_attr_t
&
attr
)
{
int64
_t
JitCodeKey
<
sgd_attr_t
>
(
const
sgd_attr_t
&
attr
)
{
return
attr
.
grad_width
;
return
attr
.
grad_width
;
}
}
...
...
paddle/fluid/operators/jit/kernel_key.h
浏览文件 @
ccc7c358
...
@@ -46,7 +46,7 @@ struct KernelKey {
...
@@ -46,7 +46,7 @@ struct KernelKey {
// Every JitCode should have a method to get the key from attribution
// Every JitCode should have a method to get the key from attribution
template
<
typename
Attr
>
template
<
typename
Attr
>
size
_t
JitCodeKey
(
const
Attr
&
attr
);
int64
_t
JitCodeKey
(
const
Attr
&
attr
);
}
// namespace jit
}
// namespace jit
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/jit/kernel_pool.h
浏览文件 @
ccc7c358
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <memory> // for unique_ptr
#include <memory> // for unique_ptr
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include <utility> // for move
#include <vector>
#include <vector>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
...
@@ -30,7 +31,7 @@ namespace jit {
...
@@ -30,7 +31,7 @@ namespace jit {
template
<
KernelType
KT
>
template
<
KernelType
KT
>
class
JitCodePool
{
class
JitCodePool
{
typedef
std
::
unique_ptr
<
GenBase
>
GenBasePtr
;
typedef
std
::
unique_ptr
<
GenBase
>
GenBasePtr
;
typedef
std
::
unordered_map
<
size
_t
,
GenBasePtr
>
JitCodeMap
;
typedef
std
::
unordered_map
<
int64
_t
,
GenBasePtr
>
JitCodeMap
;
public:
public:
JitCodePool
()
=
default
;
JitCodePool
()
=
default
;
...
@@ -41,9 +42,9 @@ class JitCodePool {
...
@@ -41,9 +42,9 @@ class JitCodePool {
const
JitCodeMap
&
AllKernels
()
{
return
codes_
;
}
const
JitCodeMap
&
AllKernels
()
{
return
codes_
;
}
bool
Has
(
size
_t
key
)
const
{
return
codes_
.
find
(
key
)
!=
codes_
.
end
();
}
bool
Has
(
int64
_t
key
)
const
{
return
codes_
.
find
(
key
)
!=
codes_
.
end
();
}
void
Insert
(
size
_t
key
,
GenBasePtr
value
)
{
void
Insert
(
int64
_t
key
,
GenBasePtr
value
)
{
codes_
.
emplace
(
key
,
std
::
move
(
value
));
codes_
.
emplace
(
key
,
std
::
move
(
value
));
}
}
...
...
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
浏览文件 @
ccc7c358
...
@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
...
@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
}
}
}
}
bool
CRFDecodingKernel
::
UseMe
(
const
int
&
d
)
const
{
bool
CRFDecodingKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
#ifdef __AVX512F__
#ifdef __AVX512F__
constexpr
int
block
=
ZMM_FLOAT_BLOCK
;
constexpr
int
block
=
ZMM_FLOAT_BLOCK
;
#else
#else
...
...
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
浏览文件 @
ccc7c358
...
@@ -26,11 +26,11 @@ namespace intrinsic {
...
@@ -26,11 +26,11 @@ namespace intrinsic {
void
CRFDecoding
(
const
int
seq_len
,
const
float
*
x
,
const
float
*
w
,
void
CRFDecoding
(
const
int
seq_len
,
const
float
*
x
,
const
float
*
w
,
float
*
alpha
,
int
*
track
,
int
tag_num
);
float
*
alpha
,
int
*
track
,
int
tag_num
);
class
CRFDecodingKernel
:
public
KernelMore
<
CRFDecodingTuple
s
<
float
>>
{
class
CRFDecodingKernel
:
public
KernelMore
<
CRFDecodingTuple
<
float
>>
{
public:
public:
CRFDecodingKernel
()
{
this
->
func
=
CRFDecoding
;
}
CRFDecodingKernel
()
{
this
->
func
=
CRFDecoding
;
}
bool
UseMe
(
bool
CanBeUsed
(
const
typename
CRFDecodingTuple
s
<
float
>::
attr_type
&
)
const
override
;
const
typename
CRFDecodingTuple
<
float
>::
attr_type
&
)
const
override
;
const
char
*
ImplType
()
const
override
{
return
"Intrinsic"
;
}
const
char
*
ImplType
()
const
override
{
return
"Intrinsic"
;
}
};
};
...
...
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
浏览文件 @
ccc7c358
...
@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
...
@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
}
}
}
}
bool
LayerNormKernel
::
UseMe
(
const
int
&
d
)
const
{
bool
LayerNormKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
>=
YMM_FLOAT_BLOCK
;
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
>=
YMM_FLOAT_BLOCK
;
}
}
...
...
paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
浏览文件 @
ccc7c358
...
@@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
...
@@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
const
float
*
scale
,
const
float
*
bias
,
int
height
,
const
float
*
scale
,
const
float
*
bias
,
int
height
,
const
float
epsilon
,
int
right
);
const
float
epsilon
,
int
right
);
class
LayerNormKernel
:
public
KernelMore
<
LayerNormTuple
s
<
float
>>
{
class
LayerNormKernel
:
public
KernelMore
<
LayerNormTuple
<
float
>>
{
public:
public:
LayerNormKernel
()
{
this
->
func
=
LayerNorm
;
}
LayerNormKernel
()
{
this
->
func
=
LayerNorm
;
}
bool
UseMe
(
const
typename
LayerNormTuples
<
float
>::
attr_type
&
)
const
override
;
bool
CanBeUsed
(
const
typename
LayerNormTuple
<
float
>::
attr_type
&
)
const
override
;
const
char
*
ImplType
()
const
override
{
return
"Intrinsic"
;
}
const
char
*
ImplType
()
const
override
{
return
"Intrinsic"
;
}
};
};
...
...
paddle/fluid/operators/jit/more/mix/mix.cc
浏览文件 @
ccc7c358
...
@@ -23,6 +23,8 @@ namespace jit {
...
@@ -23,6 +23,8 @@ namespace jit {
namespace
more
{
namespace
more
{
namespace
mix
{
namespace
mix
{
using
CPUPlace
=
platform
::
CPUPlace
;
void
VSigmoid
(
const
T
*
x
,
T
*
y
,
int
n
)
{
void
VSigmoid
(
const
T
*
x
,
T
*
y
,
int
n
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
...
@@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) {
...
@@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) {
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
}
auto
compute
=
Get
<
KernelType
::
kVExp
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
n
);
auto
compute
=
KernelFuncs
<
VExpTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
compute
(
y
,
y
,
n
);
compute
(
y
,
y
,
n
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
...
@@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) {
...
@@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) {
void
VTanh
(
const
T
*
x
,
T
*
y
,
int
n
)
{
void
VTanh
(
const
T
*
x
,
T
*
y
,
int
n
)
{
const
T
a
=
2
,
b
=
-
1
;
const
T
a
=
2
,
b
=
-
1
;
auto
compute_scal
=
Get
<
kVScal
,
AXYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
n
);
auto
compute_scal
=
KernelFuncs
<
VScalTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_addbias
=
Get
<
kVAddBias
,
AXYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
n
);
auto
compute_addbias
=
KernelFuncs
<
VAddBiasTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_sigmoid
=
Get
<
kVSigmoid
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
n
);
auto
compute_sigmoid
=
KernelFuncs
<
VSigmoidTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
compute_scal
(
&
a
,
x
,
y
,
n
);
compute_scal
(
&
a
,
x
,
y
,
n
);
compute_sigmoid
(
y
,
y
,
n
);
compute_sigmoid
(
y
,
y
,
n
);
compute_scal
(
&
a
,
y
,
y
,
n
);
compute_scal
(
&
a
,
y
,
y
,
n
);
...
@@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) {
...
@@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) {
}
}
void
Softmax
(
const
T
*
x
,
T
*
y
,
int
n
,
int
bs
)
{
void
Softmax
(
const
T
*
x
,
T
*
y
,
int
n
,
int
bs
)
{
auto
compute_hmax
=
auto
compute_hmax
=
KernelFuncs
<
HMaxTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
KernelFuncs
<
kHMax
,
XRNTuples
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_hsum
=
KernelFuncs
<
HSumTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_hsum
=
auto
compute_vscal
=
KernelFuncs
<
VScalTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
KernelFuncs
<
kHSum
,
XRNTuples
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_vscal
=
KernelFuncs
<
kVScal
,
AXYNTuples
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_vaddbias
=
auto
compute_vaddbias
=
KernelFuncs
<
kVAddBias
,
AXYNTuples
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
n
);
KernelFuncs
<
VAddBiasTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
auto
compute_vexp
=
auto
compute_vexp
=
KernelFuncs
<
VExpTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
n
);
KernelFuncs
<
kVExp
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
n
);
for
(
int
i
=
0
;
i
<
bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
bs
;
++
i
)
{
T
scalar
;
T
scalar
;
...
@@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) {
...
@@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) {
void
(
*
getActFunc
(
KernelType
type
,
int
d
))(
const
T
*
,
T
*
,
int
)
{
// NOLINT
void
(
*
getActFunc
(
KernelType
type
,
int
d
))(
const
T
*
,
T
*
,
int
)
{
// NOLINT
if
(
type
==
kVSigmoid
)
{
if
(
type
==
kVSigmoid
)
{
return
Get
<
kVSigmoid
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
return
KernelFuncs
<
VSigmoidTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
else
if
(
type
==
kVRelu
)
{
}
else
if
(
type
==
kVRelu
)
{
return
Get
<
kVRelu
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
return
KernelFuncs
<
VReluTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
else
if
(
type
==
kVTanh
)
{
}
else
if
(
type
==
kVTanh
)
{
return
Get
<
kVTanh
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
return
KernelFuncs
<
VTanhTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
else
if
(
type
==
kVIdentity
)
{
}
else
if
(
type
==
kVIdentity
)
{
return
Get
<
kVIdentity
,
XYNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
return
KernelFuncs
<
VIdentityTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
PADDLE_THROW
(
"Not support type: %s"
,
type
);
return
nullptr
;
return
nullptr
;
...
@@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
...
@@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
const
int
d
=
attr
->
d
;
const
int
d
=
attr
->
d
;
const
int
d2
=
d
*
2
;
const
int
d2
=
d
*
2
;
const
int
d3
=
d
*
3
;
const
int
d3
=
d
*
3
;
auto
vmul_d
=
Get
<
kVMul
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
auto
vmul_d
=
KernelFuncs
<
VMulTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
auto
vadd_d
=
Get
<
kVAdd
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
auto
vadd_d
=
KernelFuncs
<
VAddTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
auto
vadd_d2
=
Get
<
kVAdd
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d2
);
auto
vadd_d2
=
KernelFuncs
<
VAddTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d2
);
auto
act_gate_d
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_gate_d
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_gate_d2
=
getActFunc
(
attr
->
act_gate
,
d2
);
auto
act_gate_d2
=
getActFunc
(
attr
->
act_gate
,
d2
);
auto
act_gate_d3
=
getActFunc
(
attr
->
act_gate
,
d3
);
auto
act_gate_d3
=
getActFunc
(
attr
->
act_gate
,
d3
);
...
@@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
...
@@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
int
d
=
attr
->
d
;
int
d
=
attr
->
d
;
int
d2
=
d
*
2
;
int
d2
=
d
*
2
;
int
d3
=
d
*
3
;
int
d3
=
d
*
3
;
auto
vmul_d
=
Get
<
kVMul
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
auto
vmul_d
=
KernelFuncs
<
VMulTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
auto
vadd_d
=
Get
<
kVAdd
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
auto
vadd_d
=
KernelFuncs
<
VAddTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
auto
act_gate_d
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_gate_d
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_cand_d
=
getActFunc
(
attr
->
act_cand
,
d
);
auto
act_cand_d
=
getActFunc
(
attr
->
act_cand
,
d
);
auto
act_cell_d
=
getActFunc
(
attr
->
act_cell
,
d
);
auto
act_cell_d
=
getActFunc
(
attr
->
act_cell
,
d
);
...
@@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
...
@@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
int
d2
=
d
*
2
;
int
d2
=
d
*
2
;
auto
act_gate
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_gate
=
getActFunc
(
attr
->
act_gate
,
d
);
auto
act_cand
=
getActFunc
(
attr
->
act_cand
,
d
);
auto
act_cand
=
getActFunc
(
attr
->
act_cand
,
d
);
auto
vmul_d
=
Get
<
kVMul
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
d
);
auto
vmul_d
=
KernelFuncs
<
VMulTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
act_gate
(
gates
,
gates
,
d
);
act_gate
(
gates
,
gates
,
d
);
act_cand
(
gates
+
d2
,
gates
+
d2
,
d
);
act_cand
(
gates
+
d2
,
gates
+
d2
,
d
);
vmul_d
(
gates
,
gates
+
d2
,
ht
,
d
);
vmul_d
(
gates
,
gates
+
d2
,
ht
,
d
);
...
@@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
...
@@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
T
*
ht
=
reinterpret_cast
<
T
*>
(
step
->
ht
);
T
*
ht
=
reinterpret_cast
<
T
*>
(
step
->
ht
);
const
T
*
ht_1
=
reinterpret_cast
<
const
T
*>
(
step
->
ht_1
);
const
T
*
ht_1
=
reinterpret_cast
<
const
T
*>
(
step
->
ht_1
);
auto
act_gate
=
getActFunc
(
attr
->
act_gate
,
attr
->
d
);
auto
act_gate
=
getActFunc
(
attr
->
act_gate
,
attr
->
d
);
auto
vmul_d
=
Get
<
kVMul
,
XYZNTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
->
d
);
auto
vmul_d
=
KernelFuncs
<
VMulTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
attr
->
d
);
act_gate
(
gates
+
attr
->
d
,
gates
+
attr
->
d
,
attr
->
d
);
act_gate
(
gates
+
attr
->
d
,
gates
+
attr
->
d
,
attr
->
d
);
vmul_d
(
ht_1
,
gates
+
attr
->
d
,
ht
,
attr
->
d
);
vmul_d
(
ht_1
,
gates
+
attr
->
d
,
ht
,
attr
->
d
);
}
}
...
@@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
...
@@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
}
}
// TODO(TJ): tuning me
// TODO(TJ): tuning me
bool
VSigmoidKernel
::
UseMe
(
const
int
&
d
)
const
{
return
true
;
}
bool
VSigmoidKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
return
true
;
}
bool
VTanhKernel
::
UseMe
(
const
int
&
d
)
const
{
return
true
;
}
bool
VTanhKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
return
true
;
}
bool
SoftmaxKernel
::
UseMe
(
const
int
&
d
)
const
{
return
true
;
}
bool
SoftmaxKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
return
true
;
}
bool
LSTMCtHtKernel
::
UseMe
(
const
lstm_attr_t
&
attr
)
const
{
return
true
;
}
bool
LSTMCtHtKernel
::
CanBeUsed
(
const
lstm_attr_t
&
attr
)
const
{
return
true
;
}
bool
LSTMC1H1Kernel
::
UseMe
(
const
lstm_attr_t
&
attr
)
const
{
return
true
;
}
bool
LSTMC1H1Kernel
::
CanBeUsed
(
const
lstm_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUH1Kernel
::
UseMe
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUH1Kernel
::
CanBeUsed
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUHtPart1Kernel
::
UseMe
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUHtPart1Kernel
::
CanBeUsed
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUHtPart2Kernel
::
UseMe
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
bool
GRUHtPart2Kernel
::
CanBeUsed
(
const
gru_attr_t
&
attr
)
const
{
return
true
;
}
}
// namespace mix
}
// namespace mix
}
// namespace more
}
// namespace more
...
@@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
...
@@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
namespace
mix
=
paddle
::
operators
::
jit
::
more
::
mix
;
namespace
mix
=
paddle
::
operators
::
jit
::
more
::
mix
;
#define REGISTER_MORE_KERNEL(
key,
func) \
#define REGISTER_MORE_KERNEL(func) \
REGISTER_JITKERNEL_MORE(k
ey
, mix, mix::func##Kernel)
REGISTER_JITKERNEL_MORE(k
##func
, mix, mix::func##Kernel)
REGISTER_MORE_KERNEL
(
kVSigmoid
,
VSigmoid
);
REGISTER_MORE_KERNEL
(
VSigmoid
);
REGISTER_MORE_KERNEL
(
kVTanh
,
VTanh
);
REGISTER_MORE_KERNEL
(
VTanh
);
REGISTER_MORE_KERNEL
(
kSoftmax
,
Softmax
);
REGISTER_MORE_KERNEL
(
Softmax
);
REGISTER_MORE_KERNEL
(
kLSTMCtHt
,
LSTMCtHt
);
REGISTER_MORE_KERNEL
(
LSTMCtHt
);
REGISTER_MORE_KERNEL
(
kLSTMC1H1
,
LSTMC1H1
);
REGISTER_MORE_KERNEL
(
LSTMC1H1
);
REGISTER_MORE_KERNEL
(
kGRUH1
,
GRUH1
);
REGISTER_MORE_KERNEL
(
GRUH1
);
REGISTER_MORE_KERNEL
(
kGRUHtPart1
,
GRUHtPart1
);
REGISTER_MORE_KERNEL
(
GRUHtPart1
);
REGISTER_MORE_KERNEL
(
kGRUHtPart2
,
GRUHtPart2
);
REGISTER_MORE_KERNEL
(
GRUHtPart2
);
#undef REGISTER_MORE_KERNEL
#undef REGISTER_MORE_KERNEL
paddle/fluid/operators/jit/more/mix/mix.h
浏览文件 @
ccc7c358
...
@@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr);
...
@@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr);
void
GRUHtPart1
(
gru_t
*
step
,
const
gru_attr_t
*
attr
);
void
GRUHtPart1
(
gru_t
*
step
,
const
gru_attr_t
*
attr
);
void
GRUHtPart2
(
gru_t
*
step
,
const
gru_attr_t
*
attr
);
void
GRUHtPart2
(
gru_t
*
step
,
const
gru_attr_t
*
attr
);
#define DECLARE_MORE_KERNEL(name
, tuples)
\
#define DECLARE_MORE_KERNEL(name
)
\
class name##Kernel : public KernelMore<
tuples<T>> {
\
class name##Kernel : public KernelMore<
name##Tuple<T>> {
\
public: \
public:
\
name##Kernel() { this->func = name; } \
name##Kernel() { this->func = name; }
\
bool
UseMe(const typename tuples
<T>::attr_type&) const override; \
bool
CanBeUsed(const typename name##Tuple
<T>::attr_type&) const override; \
const char* ImplType() const override { return "Mixed"; } \
const char* ImplType() const override { return "Mixed"; }
\
}
}
// XYN
// XYN
DECLARE_MORE_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_MORE_KERNEL
(
VSigmoid
);
DECLARE_MORE_KERNEL
(
VTanh
,
XYNTuples
);
DECLARE_MORE_KERNEL
(
VTanh
);
// XRN
// XRN
DECLARE_MORE_KERNEL
(
Softmax
,
SoftmaxTuples
);
DECLARE_MORE_KERNEL
(
Softmax
);
DECLARE_MORE_KERNEL
(
LSTMCtHt
,
LSTMTuples
);
DECLARE_MORE_KERNEL
(
LSTMCtHt
);
DECLARE_MORE_KERNEL
(
LSTMC1H1
,
LSTMTuples
);
DECLARE_MORE_KERNEL
(
LSTMC1H1
);
DECLARE_MORE_KERNEL
(
GRUH1
,
GRUTuples
);
DECLARE_MORE_KERNEL
(
GRUH1
);
DECLARE_MORE_KERNEL
(
GRUHtPart1
,
GRUTuples
);
DECLARE_MORE_KERNEL
(
GRUHtPart1
);
DECLARE_MORE_KERNEL
(
GRUHtPart2
,
GRUTuples
);
DECLARE_MORE_KERNEL
(
GRUHtPart2
);
#undef DECLARE_MORE_KERNEL
#undef DECLARE_MORE_KERNEL
...
...
paddle/fluid/operators/jit/more/mkl/mkl.cc
浏览文件 @
ccc7c358
...
@@ -130,105 +130,106 @@ void ASum<double>(const double* x, double* res, int n) {
...
@@ -130,105 +130,106 @@ void ASum<double>(const double* x, double* res, int n) {
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template
<
>
template
<
>
bool
VMulKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VMulKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
}
}
template
<
>
template
<
>
bool
VAddKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VAddKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
>
512
;
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
>
512
;
}
}
template
<
>
template
<
>
bool
VScalKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VScalKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
}
}
template
<
>
template
<
>
bool
VExpKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VExpKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
d
>
7
;
return
d
>
7
;
}
}
template
<
>
template
<
>
bool
VSquareKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VSquareKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
d
>
7
;
return
d
>
7
;
}
}
template
<
>
template
<
>
bool
VCopyKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VCopyKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
d
>
15
;
return
d
>
15
;
}
}
template
<
>
template
<
>
bool
VBroadcastKernel
<
float
>::
UseMe
(
const
int64_t
&
d
)
const
{
bool
VBroadcastKernel
<
float
>::
CanBeUsed
(
const
int64_t
&
d
)
const
{
return
d
>
127
;
return
d
>
127
;
}
}
template
<
>
template
<
>
bool
VBroadcastKernel
<
double
>::
UseMe
(
const
int64_t
&
attr
)
const
{
bool
VBroadcastKernel
<
double
>::
CanBeUsed
(
const
int64_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
VSigmoidKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VSigmoidKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
d
>
7
;
return
d
>
7
;
}
}
template
<
>
template
<
>
bool
VTanhKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
VTanhKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
d
>
7
;
return
d
>
7
;
}
}
template
<
>
template
<
>
bool
SeqPoolKernel
<
float
>::
UseMe
(
const
seq_pool_attr_t
&
attr
)
const
{
bool
SeqPoolKernel
<
float
>::
CanBeUsed
(
const
seq_pool_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
SeqPoolKernel
<
double
>::
UseMe
(
const
seq_pool_attr_t
&
attr
)
const
{
bool
SeqPoolKernel
<
double
>::
CanBeUsed
(
const
seq_pool_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
EmbSeqPoolKernel
<
float
>::
UseMe
(
const
emb_seq_pool_attr_t
&
attr
)
const
{
bool
EmbSeqPoolKernel
<
float
>::
CanBeUsed
(
const
emb_seq_pool_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
EmbSeqPoolKernel
<
double
>::
UseMe
(
const
emb_seq_pool_attr_t
&
attr
)
const
{
bool
EmbSeqPoolKernel
<
double
>::
CanBeUsed
(
const
emb_seq_pool_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
SgdKernel
<
float
>::
UseMe
(
const
sgd_attr_t
&
attr
)
const
{
bool
SgdKernel
<
float
>::
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
SgdKernel
<
double
>::
UseMe
(
const
sgd_attr_t
&
attr
)
const
{
bool
SgdKernel
<
double
>::
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
MatMulKernel
<
float
>::
UseMe
(
const
matmul_attr_t
&
attr
)
const
{
bool
MatMulKernel
<
float
>::
CanBeUsed
(
const
matmul_attr_t
&
attr
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
);
return
platform
::
MayIUse
(
platform
::
avx
);
}
}
template
<
>
template
<
>
bool
MatMulKernel
<
double
>::
UseMe
(
const
matmul_attr_t
&
attr
)
const
{
bool
MatMulKernel
<
double
>::
CanBeUsed
(
const
matmul_attr_t
&
attr
)
const
{
return
true
;
return
true
;
}
}
template
<
>
template
<
>
bool
SoftmaxKernel
<
float
>::
UseMe
(
const
int
&
d
)
const
{
bool
SoftmaxKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
// tuned on avx2
// tuned on avx2
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
<
60
;
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
<
60
;
}
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
#define AWALYS_USE_ME_WITH_DOUBLE(func)
\
template <> \
template <>
\
bool func##Kernel<double>::
UseMe
(const int& d) const { \
bool func##Kernel<double>::
CanBeUsed
(const int& d) const { \
return true; \
return true;
\
}
}
AWALYS_USE_ME_WITH_DOUBLE
(
VMul
);
AWALYS_USE_ME_WITH_DOUBLE
(
VMul
);
...
@@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax);
...
@@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax);
namespace
mkl
=
paddle
::
operators
::
jit
::
more
::
mkl
;
namespace
mkl
=
paddle
::
operators
::
jit
::
more
::
mkl
;
#define REGISTER_MKL_KERNEL(
key, func)
\
#define REGISTER_MKL_KERNEL(
func)
\
REGISTER_JITKERNEL_MORE(k
ey
, mkl, mkl::func##Kernel<float>, \
REGISTER_JITKERNEL_MORE(k
##func
, mkl, mkl::func##Kernel<float>, \
mkl::func##Kernel<double>)
mkl::func##Kernel<double>)
REGISTER_MKL_KERNEL
(
kMatMul
,
MatMul
);
REGISTER_MKL_KERNEL
(
MatMul
);
REGISTER_MKL_KERNEL
(
kVMul
,
VMul
);
REGISTER_MKL_KERNEL
(
VMul
);
REGISTER_MKL_KERNEL
(
kVAdd
,
VAdd
);
REGISTER_MKL_KERNEL
(
VAdd
);
REGISTER_MKL_KERNEL
(
kVScal
,
VScal
);
REGISTER_MKL_KERNEL
(
VScal
);
REGISTER_MKL_KERNEL
(
kVExp
,
VExp
);
REGISTER_MKL_KERNEL
(
VExp
);
REGISTER_MKL_KERNEL
(
kVSquare
,
VSquare
);
REGISTER_MKL_KERNEL
(
VSquare
);
REGISTER_MKL_KERNEL
(
kVCopy
,
VCopy
);
REGISTER_MKL_KERNEL
(
VCopy
);
REGISTER_MKL_KERNEL
(
kVBroadcast
,
VBroadcast
);
REGISTER_MKL_KERNEL
(
VBroadcast
);
REGISTER_MKL_KERNEL
(
kVSigmoid
,
VSigmoid
);
REGISTER_MKL_KERNEL
(
VSigmoid
);
REGISTER_MKL_KERNEL
(
kVTanh
,
VTanh
);
REGISTER_MKL_KERNEL
(
VTanh
);
REGISTER_MKL_KERNEL
(
kSeqPool
,
SeqPool
);
REGISTER_MKL_KERNEL
(
SeqPool
);
REGISTER_MKL_KERNEL
(
kEmbSeqPool
,
EmbSeqPool
);
REGISTER_MKL_KERNEL
(
EmbSeqPool
);
REGISTER_MKL_KERNEL
(
kSoftmax
,
Softmax
);
REGISTER_MKL_KERNEL
(
Softmax
);
REGISTER_MKL_KERNEL
(
kSgd
,
Sgd
);
REGISTER_MKL_KERNEL
(
Sgd
);
#undef REGISTER_MKL_KERNEL
#undef REGISTER_MKL_KERNEL
paddle/fluid/operators/jit/more/mkl/mkl.h
浏览文件 @
ccc7c358
...
@@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
...
@@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
}
}
}
}
#define DECLARE_MKL_KERNEL(name
, tuples)
\
#define DECLARE_MKL_KERNEL(name
)
\
template <typename T> \
template <typename T>
\
class name##Kernel : public KernelMore<
tuples<T>> {
\
class name##Kernel : public KernelMore<
name##Tuple<T>> {
\
public: \
public:
\
name##Kernel() { this->func = name<T>; } \
name##Kernel() { this->func = name<T>; }
\
bool
UseMe(const typename tuples
<T>::attr_type&) const override; \
bool
CanBeUsed(const typename name##Tuple
<T>::attr_type&) const override; \
const char* ImplType() const override { return "MKL"; } \
const char* ImplType() const override { return "MKL"; }
\
}
}
// ABCMNK
// ABCMNK
DECLARE_MKL_KERNEL
(
MatMul
,
MatMulTuples
);
DECLARE_MKL_KERNEL
(
MatMul
);
// XYZN
// XYZN
DECLARE_MKL_KERNEL
(
VMul
,
XYZNTuples
);
DECLARE_MKL_KERNEL
(
VMul
);
DECLARE_MKL_KERNEL
(
VAdd
,
XYZNTuples
);
DECLARE_MKL_KERNEL
(
VAdd
);
// AXYN
// AXYN
DECLARE_MKL_KERNEL
(
VScal
,
AXYNTuples
);
DECLARE_MKL_KERNEL
(
VScal
);
// XYN
// XYN
DECLARE_MKL_KERNEL
(
VExp
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VExp
);
DECLARE_MKL_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VSigmoid
);
DECLARE_MKL_KERNEL
(
VTanh
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VTanh
);
DECLARE_MKL_KERNEL
(
VSquare
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VSquare
);
DECLARE_MKL_KERNEL
(
VCopy
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VCopy
);
DECLARE_MKL_KERNEL
(
SeqPool
,
SeqPoolTuples
);
// others
DECLARE_MKL_KERNEL
(
SeqPool
);
DECLARE_MKL_KERNEL
(
EmbSeqPool
,
EmbSeqPoolTuples
);
DECLARE_MKL_KERNEL
(
EmbSeqPool
);
DECLARE_MKL_KERNEL
(
Softmax
);
DECLARE_MKL_KERNEL
(
Softmax
,
SoftmaxTuples
);
DECLARE_MKL_KERNEL
(
Sgd
);
DECLARE_MKL_KERNEL
(
VBroadcast
);
DECLARE_MKL_KERNEL
(
Sgd
,
SgdTuples
);
DECLARE_MKL_KERNEL
(
VBroadcast
,
VBroadcastTuples
);
#undef DECLARE_MKL_KERNEL
#undef DECLARE_MKL_KERNEL
...
...
paddle/fluid/operators/jit/refer/refer.cc
浏览文件 @
ccc7c358
...
@@ -17,51 +17,43 @@
...
@@ -17,51 +17,43 @@
namespace
refer
=
paddle
::
operators
::
jit
::
refer
;
namespace
refer
=
paddle
::
operators
::
jit
::
refer
;
#define REGISTER_REFER_KERNEL(
key, func)
\
#define REGISTER_REFER_KERNEL(
func)
\
REGISTER_JITKERNEL_REFER(k
ey
, refer::func##Kernel<float>, \
REGISTER_JITKERNEL_REFER(k
##func
, refer::func##Kernel<float>, \
refer::func##Kernel<double>)
refer::func##Kernel<double>)
REGISTER_REFER_KERNEL
(
kVMul
,
VMul
);
REGISTER_REFER_KERNEL
(
VMul
);
REGISTER_REFER_KERNEL
(
kVAdd
,
VAdd
);
REGISTER_REFER_KERNEL
(
VAdd
);
REGISTER_REFER_KERNEL
(
kVAddRelu
,
VAddRelu
);
REGISTER_REFER_KERNEL
(
VAddRelu
);
REGISTER_REFER_KERNEL
(
kVSub
,
VSub
);
REGISTER_REFER_KERNEL
(
VSub
);
REGISTER_REFER_KERNEL
(
kVScal
,
VScal
);
REGISTER_REFER_KERNEL
(
VScal
);
REGISTER_REFER_KERNEL
(
kVAddBias
,
VAddBias
);
REGISTER_REFER_KERNEL
(
VAddBias
);
REGISTER_REFER_KERNEL
(
kVRelu
,
VRelu
);
REGISTER_REFER_KERNEL
(
VRelu
);
REGISTER_REFER_KERNEL
(
kVCopy
,
VCopy
);
REGISTER_REFER_KERNEL
(
VCopy
);
REGISTER_REFER_KERNEL
(
kVIdentity
,
VIdentity
);
REGISTER_REFER_KERNEL
(
VIdentity
);
REGISTER_REFER_KERNEL
(
kVSquare
,
VSquare
);
REGISTER_REFER_KERNEL
(
VSquare
);
REGISTER_REFER_KERNEL
(
kVExp
,
VExp
);
REGISTER_REFER_KERNEL
(
VExp
);
REGISTER_REFER_KERNEL
(
kVSigmoid
,
VSigmoid
);
REGISTER_REFER_KERNEL
(
VSigmoid
);
REGISTER_REFER_KERNEL
(
kVTanh
,
VTanh
);
REGISTER_REFER_KERNEL
(
VTanh
);
REGISTER_REFER_KERNEL
(
kLSTMCtHt
,
LSTMCtHt
);
REGISTER_REFER_KERNEL
(
LSTMCtHt
);
REGISTER_REFER_KERNEL
(
kLSTMC1H1
,
LSTMC1H1
);
REGISTER_REFER_KERNEL
(
LSTMC1H1
);
REGISTER_REFER_KERNEL
(
kGRUH1
,
GRUH1
);
REGISTER_REFER_KERNEL
(
GRUH1
);
REGISTER_REFER_KERNEL
(
kGRUHtPart1
,
GRUHtPart1
);
REGISTER_REFER_KERNEL
(
GRUHtPart1
);
REGISTER_REFER_KERNEL
(
kGRUHtPart2
,
GRUHtPart2
);
REGISTER_REFER_KERNEL
(
GRUHtPart2
);
REGISTER_REFER_KERNEL
(
kCRFDecoding
,
CRFDecoding
);
REGISTER_REFER_KERNEL
(
CRFDecoding
);
REGISTER_REFER_KERNEL
(
kLayerNorm
,
LayerNorm
);
REGISTER_REFER_KERNEL
(
LayerNorm
);
REGISTER_REFER_KERNEL
(
NCHW16CMulNC
);
REGISTER_REFER_KERNEL
(
kNCHW16CMulNC
,
NCHW16CMulNC
);
REGISTER_REFER_KERNEL
(
SeqPool
);
REGISTER_REFER_KERNEL
(
MatMul
);
REGISTER_REFER_KERNEL
(
kSeqPool
,
SeqPool
);
REGISTER_REFER_KERNEL
(
HMax
);
REGISTER_REFER_KERNEL
(
HSum
);
REGISTER_REFER_KERNEL
(
kMatMul
,
MatMul
);
REGISTER_REFER_KERNEL
(
Softmax
);
REGISTER_REFER_KERNEL
(
EmbSeqPool
);
REGISTER_REFER_KERNEL
(
kHMax
,
HMax
);
REGISTER_REFER_KERNEL
(
Sgd
);
REGISTER_REFER_KERNEL
(
kHSum
,
HSum
);
REGISTER_REFER_KERNEL
(
VBroadcast
);
REGISTER_REFER_KERNEL
(
kSoftmax
,
Softmax
);
REGISTER_REFER_KERNEL
(
kEmbSeqPool
,
EmbSeqPool
);
REGISTER_REFER_KERNEL
(
kSgd
,
Sgd
);
REGISTER_REFER_KERNEL
(
kVBroadcast
,
VBroadcast
);
#undef REGISTER_REFER_KERNEL
#undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
浏览文件 @
ccc7c358
...
@@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
...
@@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
}
}
}
}
#define DECLARE_REFER_KERNEL(name
, tuples)
\
#define DECLARE_REFER_KERNEL(name
)
\
template <typename T> \
template <typename T>
\
class name##Kernel : public ReferKernel<
tuples
<T>> { \
class name##Kernel : public ReferKernel<
name##Tuple
<T>> { \
public: \
public:
\
name##Kernel() { this->func = name<T>; } \
name##Kernel() { this->func = name<T>; }
\
}
}
// const T* x, const T* y, T* z, int n
// const T* x, const T* y, T* z, int n
DECLARE_REFER_KERNEL
(
VMul
,
XYZNTuples
);
DECLARE_REFER_KERNEL
(
VMul
);
DECLARE_REFER_KERNEL
(
VAdd
,
XYZNTuples
);
DECLARE_REFER_KERNEL
(
VAdd
);
DECLARE_REFER_KERNEL
(
VAddRelu
,
XYZNTuples
);
DECLARE_REFER_KERNEL
(
VAddRelu
);
DECLARE_REFER_KERNEL
(
VSub
,
XYZNTuples
);
DECLARE_REFER_KERNEL
(
VSub
);
// const T* a, const T* x, T* y, int n
// const T* a, const T* x, T* y, int n
DECLARE_REFER_KERNEL
(
VScal
,
AXYNTuples
);
DECLARE_REFER_KERNEL
(
VScal
);
DECLARE_REFER_KERNEL
(
VAddBias
,
AXYNTuples
);
DECLARE_REFER_KERNEL
(
VAddBias
);
// const T* x, T* y, int n
// const T* x, T* y, int n
DECLARE_REFER_KERNEL
(
VRelu
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VRelu
);
DECLARE_REFER_KERNEL
(
VIdentity
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VIdentity
);
DECLARE_REFER_KERNEL
(
VExp
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VExp
);
DECLARE_REFER_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VSigmoid
);
DECLARE_REFER_KERNEL
(
VTanh
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VTanh
);
DECLARE_REFER_KERNEL
(
VSquare
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VSquare
);
DECLARE_REFER_KERNEL
(
VCopy
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VCopy
);
// lstm_t*, const lstm_attr_t*
// lstm_t*, const lstm_attr_t*
DECLARE_REFER_KERNEL
(
LSTMCtHt
,
LSTMTuples
);
DECLARE_REFER_KERNEL
(
LSTMCtHt
);
DECLARE_REFER_KERNEL
(
LSTMC1H1
,
LSTMTuples
);
DECLARE_REFER_KERNEL
(
LSTMC1H1
);
// gru_t*, const gru_attr_t*
// gru_t*, const gru_attr_t*
DECLARE_REFER_KERNEL
(
GRUH1
,
GRUTuples
);
DECLARE_REFER_KERNEL
(
GRUH1
);
DECLARE_REFER_KERNEL
(
GRUHtPart1
,
GRUTuples
);
DECLARE_REFER_KERNEL
(
GRUHtPart1
);
DECLARE_REFER_KERNEL
(
GRUHtPart2
,
GRUTuples
);
DECLARE_REFER_KERNEL
(
GRUHtPart2
);
DECLARE_REFER_KERNEL
(
CRFDecoding
,
CRFDecodingTuples
);
DECLARE_REFER_KERNEL
(
HMax
);
DECLARE_REFER_KERNEL
(
LayerNorm
,
LayerNormTuples
);
DECLARE_REFER_KERNEL
(
HSum
);
DECLARE_REFER_KERNEL
(
NCHW16CMulNC
,
NCHW16CMulNCTuples
);
// others
DECLARE_REFER_KERNEL
(
CRFDecoding
);
DECLARE_REFER_KERNEL
(
SeqPool
,
SeqPoolTuples
);
DECLARE_REFER_KERNEL
(
LayerNorm
);
DECLARE_REFER_KERNEL
(
NCHW16CMulNC
);
DECLARE_REFER_KERNEL
(
MatMul
,
MatMulTuples
);
DECLARE_REFER_KERNEL
(
SeqPool
);
DECLARE_REFER_KERNEL
(
MatMul
);
DECLARE_REFER_KERNEL
(
HMax
,
XRNTuples
);
DECLARE_REFER_KERNEL
(
Softmax
);
DECLARE_REFER_KERNEL
(
HSum
,
XRNTuples
);
DECLARE_REFER_KERNEL
(
EmbSeqPool
);
DECLARE_REFER_KERNEL
(
Sgd
);
DECLARE_REFER_KERNEL
(
Softmax
,
SoftmaxTuples
);
DECLARE_REFER_KERNEL
(
VBroadcast
);
DECLARE_REFER_KERNEL
(
EmbSeqPool
,
EmbSeqPoolTuples
);
DECLARE_REFER_KERNEL
(
Sgd
,
SgdTuples
);
DECLARE_REFER_KERNEL
(
VBroadcast
,
VBroadcastTuples
);
#undef DECLARE_REFER_KERNEL
#undef DECLARE_REFER_KERNEL
...
...
paddle/fluid/operators/jit/registry.h
浏览文件 @
ccc7c358
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <memory>
#include <memory>
#include <tuple>
#include <tuple>
#include <type_traits>
#include <type_traits>
#include <utility> // for std::move
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_pool.h"
#include "paddle/fluid/operators/jit/kernel_pool.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
...
@@ -49,8 +50,8 @@ struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
...
@@ -49,8 +50,8 @@ struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
void
operator
()(
KernelType
kt
)
const
{
void
operator
()(
KernelType
kt
)
const
{
KernelKey
kkey
(
kt
,
PlaceType
());
KernelKey
kkey
(
kt
,
PlaceType
());
Pool
().
Instance
().
Insert
(
kkey
,
Pool
::
Instance
().
Insert
(
kkey
,
std
::
move
(
make_unique
<
const
KERNEL_IMPL_TYPE
>
()));
std
::
move
(
make_unique
<
const
KERNEL_IMPL_TYPE
>
()));
constexpr
auto
size
=
std
::
tuple_size
<
std
::
tuple
<
KernelImpls
...
>>::
value
;
constexpr
auto
size
=
std
::
tuple_size
<
std
::
tuple
<
KernelImpls
...
>>::
value
;
JitKernelRegistrarFunctor
<
Pool
,
PlaceType
,
I
+
1
==
size
,
I
+
1
,
JitKernelRegistrarFunctor
<
Pool
,
PlaceType
,
I
+
1
==
size
,
I
+
1
,
KernelImpls
...
>
KernelImpls
...
>
...
...
paddle/fluid/operators/jit/test.cc
浏览文件 @
ccc7c358
此差异已折叠。
点击以展开。
paddle/fluid/operators/layer_norm_op.h
浏览文件 @
ccc7c358
...
@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
...
@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ
(
bias
->
numel
(),
right
);
PADDLE_ENFORCE_EQ
(
bias
->
numel
(),
right
);
auto
ker
=
auto
ker
=
jit
::
Get
<
jit
::
kLayerNorm
,
jit
::
LayerNormTuples
<
T
>
,
platform
::
CPUPlace
>
(
jit
::
KernelFuncs
<
jit
::
LayerNormTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
()
right
);
.
At
(
right
);
ker
(
x
.
data
<
T
>
(),
out
.
data
<
T
>
(),
mean
->
data
<
T
>
(),
var
->
data
<
T
>
(),
ker
(
x
.
data
<
T
>
(),
out
.
data
<
T
>
(),
mean
->
data
<
T
>
(),
var
->
data
<
T
>
(),
scale
->
data
<
T
>
(),
bias
->
data
<
T
>
(),
static_cast
<
int
>
(
left
),
scale
->
data
<
T
>
(),
bias
->
data
<
T
>
(),
static_cast
<
int
>
(
left
),
static_cast
<
const
float
>
(
epsilon
),
right
);
static_cast
<
const
float
>
(
epsilon
),
right
);
...
...
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
ccc7c358
...
@@ -30,17 +30,16 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
...
@@ -30,17 +30,16 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
return
;
return
;
}
}
if
(
relu
)
{
if
(
relu
)
{
auto
compute
=
jit
::
KernelFuncs
<
jit
::
kVAddRelu
,
jit
::
XYZNTuples
<
T
>
,
auto
compute
=
platform
::
CPUPlace
>::
Cache
()
jit
::
KernelFuncs
<
jit
::
VAddReluTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
.
At
(
N
);
N
);
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
T
*
dst
=
Y
+
i
*
N
;
compute
(
B
,
dst
,
dst
,
N
);
compute
(
B
,
dst
,
dst
,
N
);
}
}
}
else
{
}
else
{
auto
compute
=
jit
::
KernelFuncs
<
jit
::
kVAdd
,
jit
::
XYZNTuples
<
T
>
,
auto
compute
=
platform
::
CPUPlace
>::
Cache
()
jit
::
KernelFuncs
<
jit
::
VAddTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
N
);
.
At
(
N
);
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#pragma omp parallel for
#endif
#endif
...
...
paddle/fluid/operators/math/sequence_pooling.cc
浏览文件 @
ccc7c358
...
@@ -256,8 +256,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
...
@@ -256,8 +256,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
static_cast
<
int
>
(
input
.
numel
()
/
input
.
dims
()[
0
]),
static_cast
<
int
>
(
input
.
numel
()
/
input
.
dims
()[
0
]),
jit
::
SeqPoolType
::
kSum
);
jit
::
SeqPoolType
::
kSum
);
auto
seqpool
=
auto
seqpool
=
jit
::
Get
<
jit
::
kSeqPool
,
jit
::
SeqPoolTuples
<
T
>
,
platform
::
CPUPlace
>
(
jit
::
KernelFuncs
<
jit
::
SeqPoolTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
()
attr
);
.
At
(
attr
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
attr
.
h
=
static_cast
<
int
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
attr
.
h
=
static_cast
<
int
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
seqpool
(
src
,
dst
,
&
attr
);
seqpool
(
src
,
dst
,
&
attr
);
...
...
paddle/fluid/operators/math/softmax_impl.h
浏览文件 @
ccc7c358
...
@@ -82,8 +82,7 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
...
@@ -82,8 +82,7 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
const
int
kClassDim
=
1
;
const
int
kClassDim
=
1
;
// 2D data. Batch x C
// 2D data. Batch x C
auto
compute_softmax
=
auto
compute_softmax
=
jit
::
KernelFuncs
<
jit
::
kSoftmax
,
jit
::
SoftmaxTuples
<
float
>
,
jit
::
KernelFuncs
<
jit
::
SoftmaxTuple
<
float
>
,
platform
::
CPUPlace
>::
Cache
()
platform
::
CPUPlace
>::
Cache
()
.
At
(
in_dims
[
kClassDim
]);
.
At
(
in_dims
[
kClassDim
]);
compute_softmax
(
in_data
,
out_data
,
in_dims
[
kClassDim
],
in_dims
[
kBatchDim
]);
compute_softmax
(
in_data
,
out_data
,
in_dims
[
kClassDim
],
in_dims
[
kBatchDim
]);
}
}
...
...
paddle/fluid/operators/optimizers/sgd_op.h
浏览文件 @
ccc7c358
...
@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
...
@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
T
*
out_data
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
out_data
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
sgd
=
auto
sgd
=
jit
::
Get
<
jit
::
kSgd
,
jit
::
SgdTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
);
jit
::
KernelFuncs
<
jit
::
SgdTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
);
sgd
(
lr
,
param_data
,
grad_data
,
&
rows_idx
,
out_data
,
&
attr
);
sgd
(
lr
,
param_data
,
grad_data
,
&
rows_idx
,
out_data
,
&
attr
);
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
...
@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
...
@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ
(
attr
.
grad_width
,
attr
.
param_width
);
PADDLE_ENFORCE_EQ
(
attr
.
grad_width
,
attr
.
param_width
);
auto
sgd
=
auto
sgd
=
jit
::
Get
<
jit
::
kSgd
,
jit
::
SgdTuples
<
T
>
,
platform
::
CPUPlace
>
(
attr
);
jit
::
KernelFuncs
<
jit
::
SgdTuple
<
T
>
,
platform
::
CPUPlace
>::
Cache
().
At
(
attr
);
sgd
(
lr
,
param_data
,
grad_data
,
rows_data
,
out_data
,
&
attr
);
sgd
(
lr
,
param_data
,
grad_data
,
rows_data
,
out_data
,
&
attr
);
}
else
{
}
else
{
PADDLE_THROW
(
"Unsupported Variable Type of Grad"
);
PADDLE_THROW
(
"Unsupported Variable Type of Grad"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录