Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
6447155d
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6447155d
编写于
10月 17, 2018
作者:
T
tensor-tang
提交者:
GitHub
10月 17, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #13851 from tensor-tang/fea/jitkernel_peephole
Fea jitkernel lstm peephole
上级
7fb5b66a
bcb8ea39
变更
16
显示空白变更内容
内联
并排
Showing
16 changed file
with
2274 addition
and
360 deletion
+2274
-360
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-1
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+102
-261
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+4
-2
paddle/fluid/operators/math/cpu_lstm_compute.h
paddle/fluid/operators/math/cpu_lstm_compute.h
+0
-64
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+16
-19
paddle/fluid/operators/math/cpu_vec_test.cc
paddle/fluid/operators/math/cpu_vec_test.cc
+6
-10
paddle/fluid/operators/math/jit_kernel.cc
paddle/fluid/operators/math/jit_kernel.cc
+41
-0
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+142
-0
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+391
-0
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+400
-0
paddle/fluid/operators/math/jit_kernel_lstm.cc
paddle/fluid/operators/math/jit_kernel_lstm.cc
+308
-0
paddle/fluid/operators/math/jit_kernel_macro.h
paddle/fluid/operators/math/jit_kernel_macro.h
+111
-0
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+749
-0
paddle/fluid/platform/cpu_info.cc
paddle/fluid/platform/cpu_info.cc
+1
-1
paddle/fluid/platform/cpu_info.h
paddle/fluid/platform/cpu_info.h
+1
-1
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-1
未找到文件。
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
6447155d
...
@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
...
@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
op_library
(
sequence_pad_op DEPS sequence_padding
)
op_library
(
sequence_pad_op DEPS sequence_padding
)
op_library
(
unstack_op DEPS stack_op
)
op_library
(
unstack_op DEPS stack_op
)
op_library
(
fake_quantize_op DEPS memory
)
op_library
(
fake_quantize_op DEPS memory
)
op_library
(
fusion_lstm_op DEPS
cpu_lstm_compute
)
op_library
(
fusion_lstm_op DEPS
jit_kernel
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
layer_norm_op DEPS cub
)
op_library
(
layer_norm_op DEPS cub
)
...
...
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
6447155d
...
@@ -15,11 +15,9 @@ limitations under the License. */
...
@@ -15,11 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -219,24 +217,8 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
...
@@ -219,24 +217,8 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
template
<
typename
T
>
template
<
typename
T
>
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
#define INIT_VEC_FUNC \
#define INIT_BASE_DEFINES \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
using DeviceContext = paddle::platform::CPUDeviceContext; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
...
@@ -247,23 +229,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -247,23 +229,19 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes");
bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
#define INIT_BASE_SIZES \
auto x_dims = x->dims();
/* T x M*/
\
auto x_dims = x->dims();
/* T x M*/
\
auto wh_dims = wh->dims();
/* D x 4D*/
\
auto wh_dims = wh->dims();
/* D x 4D*/
\
const int M = x_dims[1]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D4 = wh_dims[1]
const int D3 = D * 3; \
const int D4 = wh_dims[1];
#define INIT_
BASE_INPUT_DATAS
\
#define INIT_
OTHER_DEFINES
\
const T* x_data = x->data<T>(); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/
\
/* diagonal weight*/
\
const T* w
c_data = bias->data<T>() + D4;
\
const T* w
p_data = bias->data<T>() + D4;
\
/* for peephole only*/
\
/* for peephole only*/
\
T* checked_cell_data = nullptr; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
auto place = ctx.GetPlace(); \
...
@@ -271,69 +249,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -271,69 +249,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
}
} \
const auto& ker = \
/// Compute LSTM
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
const std::string&, const std::string&>( \
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
#define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
wh_data, D4, static_cast<T>(1), out, D4)
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/
\
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */
\
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/
\
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */
\
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/
\
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/
\
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
INIT_BASE_DEFINES
;
INIT_BASE_INPUT_OUTPUT
INIT_OTHER_DEFINES
;
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto
x_lod
=
x
->
lod
();
auto
x_lod
=
x
->
lod
();
const
int
total_T
=
x_dims
[
0
];
const
int
total_T
=
x_dims
[
0
];
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
...
@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
gate_offset
=
-
D
;
gate_offset
=
-
D
;
}
}
#define MOVE_ONE_STEP \
prev_h_data = h_out_data; \
prev_c_data = c_out_data; \
xx_data = xx_data + xx_offset; \
h_out_data = h_out_data + gate_offset; \
c_out_data = c_out_data + gate_offset
#define PROCESS_H0C0_DEFINES \
int bid = is_reverse ? N - 1 - i : i; \
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
const T* prev_c_data = nullptr; \
const T* prev_h_data = nullptr; \
int tstart = 0
#define PROCESS_H0C0_PEEPHOLE \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
#define PROCESS_H0C0 \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
if
(
use_peepholes
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
PROCESS_H0C0_PEEPHOLE
int
bid
=
is_reverse
?
N
-
1
-
i
:
i
;
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
int
seq_len
=
x_lod
[
0
][
bid
+
1
]
-
x_lod
[
0
][
bid
];
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
const
T
*
prev_c_data
=
nullptr
;
COMPUTE_CtHt_PEEPHOLE
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
);
const
T
*
prev_h_data
=
nullptr
;
MOVE_ONE_STEP
;
int
tstart
=
0
;
}
if
(
h0_data
)
{
}
prev_h_data
=
h0_data
+
bid
*
D
;
}
else
{
prev_c_data
=
c0_data
+
bid
*
D
;
// TODO(TJ): unly workaround, clean me
std
::
function
<
void
(
T
*
,
const
T
*
,
T
*
,
T
*
)
>
compute_ctht
;
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
)
&&
act_gate_str
==
"sigmoid"
&&
act_cand_str
==
"tanh"
&&
act_cell_str
==
"tanh"
&&
D
==
8
)
{
compute_ctht
=
math
::
lstm_compute_ctht
<
T
>
;
}
else
{
}
else
{
compute_ctht
=
[
&
](
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
ker
->
ComputeC1H1
(
xx_data
,
c_out_data
,
h_out_data
,
wp_data
);
COMPUTE_CtHt
(
gates
,
ct_1
,
ct
,
ht
);
tstart
=
1
;
};
// move one step
prev_h_data
=
h_out_data
;
prev_c_data
=
c_out_data
;
xx_data
=
xx_data
+
xx_offset
;
h_out_data
=
h_out_data
+
gate_offset
;
c_out_data
=
c_out_data
+
gate_offset
;
}
}
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
PROCESS_H0C0
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
compute_ctht
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
);
ker
->
ComputeCtHt
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
,
wp_data
,
MOVE_ONE_STEP
;
checked_cell_data
);
// move one step
prev_h_data
=
h_out_data
;
prev_c_data
=
c_out_data
;
xx_data
=
xx_data
+
xx_offset
;
h_out_data
=
h_out_data
+
gate_offset
;
c_out_data
=
c_out_data
+
gate_offset
;
}
}
}
}
}
}
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
}
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
platform
::
CPUDeviceContext
;
INIT_BASE_DEFINES
;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
xx
->
Resize
({
x_dims
[
0
],
D4
});
xx
->
Resize
({
x_dims
[
0
],
D4
});
SeqCompute
(
ctx
);
SeqCompute
(
ctx
);
return
;
return
;
}
}
INIT_VEC_FUNC
INIT_OTHER_DEFINES
;
INIT_BASE_INPUT_DATAS
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
...
@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_c_data
=
reordered_c0_data
;
prev_c_data
=
reordered_c0_data
;
size_t
sz
=
sizeof
(
T
)
*
D
;
size_t
sz
=
sizeof
(
T
)
*
D
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
std
::
memcpy
(
reordered_h0_data
,
h0_data
+
seq_order
[
i
]
*
D
,
sz
);
blas
.
VCOPY
(
sz
,
h0_data
+
seq_order
[
i
]
*
D
,
reordered_h0_data
);
std
::
memcpy
(
reordered_c0_data
,
c0_data
+
seq_order
[
i
]
*
D
,
sz
);
blas
.
VCOPY
(
sz
,
c0_data
+
seq_order
[
i
]
*
D
,
reordered_c0_data
);
reordered_h0_data
+=
D
;
reordered_h0_data
+=
D
;
reordered_c0_data
+=
D
;
reordered_c0_data
+=
D
;
}
}
...
@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
GET_Ct_NOH0C0
(
cur_in_data
,
cur_c_out_data
);
ker
->
ComputeC1H1
(
cur_in_data
,
cur_c_out_data
,
cur_h_out_data
,
wp_data
);
if
(
use_peepholes
)
{
blas
.
VMUL
(
D
,
wc_data
+
D2
,
cur_c_out_data
,
cur_in_data
+
D
);
blas
.
VADD
(
D
,
cur_in_data
+
D
,
cur_in_data
+
D3
,
cur_in_data
+
D3
);
}
act_gate
(
D
,
cur_in_data
+
D3
,
cur_in_data
+
D3
);
GET_Ht
(
cur_c_out_data
,
cur_in_data
,
cur_h_out_data
);
cur_in_data
+=
D4
;
cur_in_data
+=
D4
;
cur_c_out_data
+=
D
;
cur_c_out_data
+=
D
;
cur_h_out_data
+=
D
;
cur_h_out_data
+=
D
;
...
@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_h_data
=
batched_h_out_data
;
prev_h_data
=
batched_h_out_data
;
prev_c_data
=
batched_c_out_data
;
prev_c_data
=
batched_c_out_data
;
}
}
// compute kernel part
const
auto
&
batch_starts
=
batched_lod
[
0
];
const
auto
&
batch_starts
=
batched_lod
[
0
];
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
const
int
offset
=
tstart
*
max_bs
*
D
;
const
int
offset
=
tstart
*
max_bs
*
D
;
batched_input_data
=
batched_input_data
+
offset
*
4
;
batched_input_data
=
batched_input_data
+
offset
*
4
;
batched_h_out_data
=
batched_h_out_data
+
offset
;
batched_h_out_data
=
batched_h_out_data
+
offset
;
batched_c_out_data
=
batched_c_out_data
+
offset
;
batched_c_out_data
=
batched_c_out_data
+
offset
;
#define DEFINE_CUR \
T* cur_in_data = batched_input_data; \
T* cur_prev_c_data = prev_c_data; \
T* cur_c_out_data = batched_c_out_data; \
T* cur_h_out_data = batched_h_out_data
#define MOVE_ONE_BATCH \
cur_in_data += D4; \
cur_prev_c_data += D; \
cur_c_out_data += D; \
cur_h_out_data += D
#define MOVE_ONE_STEP \
prev_c_data = batched_c_out_data; \
prev_h_data = batched_h_out_data; \
batched_c_out_data = cur_c_out_data; \
batched_h_out_data = cur_h_out_data; \
batched_input_data = cur_in_data
if
(
use_peepholes
)
{
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
DEFINE_CUR
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
COMPUTE_CtHt_PEEPHOLE
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
cur_h_out_data
);
MOVE_ONE_BATCH
;
}
MOVE_ONE_STEP
;
}
}
else
{
// TODO(TJ): unly workaround, clean me
std
::
function
<
void
(
T
*
,
const
T
*
,
T
*
,
T
*
)
>
compute_ctht
;
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
)
&&
act_gate_str
==
"sigmoid"
&&
act_cand_str
==
"tanh"
&&
act_cell_str
==
"tanh"
&&
D
==
8
)
{
compute_ctht
=
math
::
lstm_compute_ctht
<
T
>
;
}
else
{
compute_ctht
=
[
&
](
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
COMPUTE_CtHt
(
gates
,
ct_1
,
ct
,
ht
);
};
}
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
DEFINE_CUR
;
T
*
cur_in_data
=
batched_input_data
;
T
*
cur_prev_c_data
=
prev_c_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
compute_ctht
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
ker
->
ComputeCtHt
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
cur_h_out_data
);
cur_h_out_data
,
wp_data
,
checked_cell_data
);
MOVE_ONE_BATCH
;
// move one batch
}
cur_in_data
+=
D4
;
MOVE_ONE_STEP
;
cur_prev_c_data
+=
D
;
cur_c_out_data
+=
D
;
cur_h_out_data
+=
D
;
}
}
// move one step
prev_c_data
=
batched_c_out_data
;
prev_h_data
=
batched_h_out_data
;
batched_c_out_data
=
cur_c_out_data
;
batched_h_out_data
=
cur_h_out_data
;
batched_input_data
=
cur_in_data
;
}
}
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batched_h_out
->
set_lod
(
batched_lod
);
batched_h_out
->
set_lod
(
batched_lod
);
...
@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
}
}
}
}
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON
#undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS
#undef INIT_OTHER_DEFINES
#undef INIT_BASE_SIZES
#undef INIT_BASE_DEFINES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
};
};
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
6447155d
...
@@ -45,8 +45,6 @@ math_library(im2col)
...
@@ -45,8 +45,6 @@ math_library(im2col)
if
(
NOT WIN32
)
# windows do not support avx functions yet.
if
(
NOT WIN32
)
# windows do not support avx functions yet.
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
lstm_compute DEPS activation_functions
)
math_library
(
lstm_compute DEPS activation_functions
)
# TODO(TJ): ugly workaround, clean me
cc_library
(
cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto device_context
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto device_context
)
...
@@ -76,3 +74,7 @@ if(WITH_GPU)
...
@@ -76,3 +74,7 @@ if(WITH_GPU)
endif
()
endif
()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat
)
cc_test
(
concat_test SRCS concat_test.cc DEPS concat
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_library
(
jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
paddle/fluid/operators/math/cpu_lstm_compute.h
已删除
100644 → 0
浏览文件 @
7fb5b66a
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
// TODO(TJ): ugly workaround, clean me
template
<
typename
T
>
void
lstm_compute_ctht
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
// gates: W_ch, W_ih, W_fh, W_oh
vec_sigmoid
<
T
,
platform
::
jit
::
avx
>
(
24
,
gates
+
8
,
gates
+
8
);
vec_tanh
<
T
,
platform
::
jit
::
avx
>
(
8
,
gates
,
gates
);
const
T
*
i
=
gates
+
8
,
*
f
=
gates
+
16
,
*
o
=
gates
+
24
;
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
d
=
0
;
d
<
8
;
++
d
)
{
// C_t = C_t-1 * fgated + cand_gated * igated
ct
[
d
]
=
ct_1
[
d
]
*
f
[
d
]
+
gates
[
d
]
*
i
[
d
];
// H_t = act_cell(C_t) * ogated
T
tmp
=
ct
[
d
]
*
2
;
tmp
=
static_cast
<
T
>
(
0
)
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vec_exp
<
T
>
(
1
,
&
tmp
,
&
tmp
);
tmp
=
static_cast
<
T
>
(
2
)
/
(
static_cast
<
T
>
(
1
)
+
tmp
)
-
static_cast
<
T
>
(
1
);
ht
[
d
]
=
tmp
*
o
[
d
];
}
}
#ifdef __AVX__
namespace
detail
{
namespace
forward
{
namespace
avx
{
__m256
Sigmoid
(
const
__m256
a
);
__m256
Tanh
(
const
__m256
a
);
}
// namespace avx
}
// namespace forward
}
// namespace detail
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
);
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cpu_vec.h
浏览文件 @
6447155d
...
@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
...
@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_scal
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
inline
void
vec_scal
<
float
,
platform
::
jit
::
avx512f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
}
...
@@ -181,7 +179,7 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
...
@@ -181,7 +179,7 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
...
@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx512
_common
>
(
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
...
@@ -296,7 +294,7 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
...
@@ -296,7 +294,7 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_add_bias
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_add_bias
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
...
@@ -390,7 +388,7 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -390,7 +388,7 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_sigmoid
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_sigmoid
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
...
@@ -454,8 +452,7 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -454,8 +452,7 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_relu
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
inline
void
vec_relu
<
float
,
platform
::
jit
::
avx512f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_relu
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
vec_relu
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
...
...
paddle/fluid/operators/math/cpu_vec_test.cc
浏览文件 @
6447155d
...
@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
...
@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
f
>
,
ref_sigmoid
<
float
>
);
ref_sigmoid
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
...
@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
...
@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512f
>
,
ref_tanh
<
float
>
);
ref_tanh
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
}
}
...
@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
...
@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512f
>
,
ref_relu
<
float
>
);
ref_relu
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
}
}
...
@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
...
@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
_common
>
,
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
f
>
,
ref_sigmoid
<
float
>
);
ref_sigmoid
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
...
@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
...
@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512_common
>
,
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512f
>
,
ref_tanh
<
float
>
);
ref_tanh
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
}
}
...
@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
...
@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512_common
>
,
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512f
>
,
ref_relu
<
float
>
);
ref_relu
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
}
}
paddle/fluid/operators/math/
cpu_lstm_compute
.cc
→
paddle/fluid/operators/math/
jit_kernel
.cc
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <iostream>
#include <string>
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
#ifdef __AVX__
namespace
jitkernel
{
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
namespace
jit
=
platform
::
jit
;
float
*
ht
)
{
namespace
act
=
detail
::
forward
::
avx
;
KernelPool
&
KernelPool
::
Instance
()
{
// gates: W_ch, W_ih, W_fh, W_oh
static
thread_local
KernelPool
g_jit_kernels
;
__m256
c
,
i
,
f
,
o
;
return
g_jit_kernels
;
c
=
_mm256_loadu_ps
(
gates
);
}
i
=
_mm256_loadu_ps
(
gates
+
8
);
f
=
_mm256_loadu_ps
(
gates
+
16
);
std
::
shared_ptr
<
const
Kernel
>
KernelPool
::
Get
(
const
std
::
string
&
key
)
const
{
o
=
_mm256_loadu_ps
(
gates
+
24
);
if
(
kers_
.
find
(
key
)
==
kers_
.
end
())
{
return
nullptr
;
/* C_t = C_t-1 * fgated + cand_gated * igated*/
}
c
=
_mm256_mul_ps
(
act
::
Tanh
(
c
),
act
::
Sigmoid
(
i
));
return
kers_
.
at
(
key
);
i
=
_mm256_loadu_ps
(
ct_1
);
f
=
_mm256_mul_ps
(
i
,
act
::
Sigmoid
(
f
));
f
=
_mm256_add_ps
(
c
,
f
);
_mm256_storeu_ps
(
ct
,
f
);
/* H_t = act_cell(C_t) * ogated */
o
=
_mm256_mul_ps
(
act
::
Tanh
(
f
),
act
::
Sigmoid
(
o
));
_mm256_storeu_ps
(
ht
,
o
);
}
}
#endif
}
// namespace jitkernel
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel.h
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <memory> // for shared_ptr
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"
// Note: Only support on CPU yet.
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
typedef
enum
{
kLT8
,
kEQ8
,
kGT8LT16
,
kEQ16
,
kGT16
}
jit_block
;
class
Kernel
{
public:
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
int
num_
{
0
};
int
end_
{
0
};
int
rest_
{
0
};
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
};
class
KernelPool
{
public:
static
KernelPool
&
Instance
();
template
<
typename
Ker
,
typename
...
ARGS
>
std
::
shared_ptr
<
const
Ker
>
Get
(
ARGS
...
args
);
std
::
shared_ptr
<
const
Kernel
>
Get
(
const
std
::
string
&
key
)
const
;
private:
KernelPool
()
=
default
;
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
const
Kernel
>>
kers_
;
DISABLE_COPY_AND_ASSIGN
(
KernelPool
);
};
template
<
typename
T
>
class
VMulKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VAddKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VScalKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
T
*
x
)
const
=
0
;
};
template
<
typename
T
>
class
VAddBiasKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VActKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VReluKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VIdentityKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VExpKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VSigmoidKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VTanhKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
LSTMKernel
:
public
Kernel
{
public:
virtual
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
/* below only used in peephole*/
const
T
*
wp_data
=
nullptr
,
T
*
checked
=
nullptr
)
const
=
0
;
// compute c1 and h1 without c0 or h0
virtual
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
/* below only used in peephole*/
const
T
*
wp_data
=
nullptr
)
const
=
0
;
};
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_blas.cc
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
/* VMUL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
public:
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VMulKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsMul(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VMulKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdMul(this->num_, x, y, z); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VMulKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_mul_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
// avx > for > mkl
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VADD JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddKernelImpl
:
public
VAddKernel
<
T
>
{
public:
explicit
VAddKernelImpl
(
int
d
)
:
VAddKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VAddKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VAddKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(this->num_, x, y, z); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_add_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VSCAL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VScalKernelImpl
:
public
VScalKernel
<
T
>
{
public:
explicit
VScalKernelImpl
(
int
d
)
:
VScalKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
Compute
(
const
T
a
,
T
*
x
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VScalKernelImpl<float, isa, block>::Compute(const float a, float* x) \
const { \
platform::dynload::cblas_sscal(this->num_, a, x, 1); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VScalKernelImpl<double, isa, block>::Compute(const double a, double* x) \
const { \
platform::dynload::cblas_dscal(this->num_, a, x, 1); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI8_INPLACE_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute(const float a, float* x) \
const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(x, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI8_INPLACE_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VAddBias JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddBiasKernelImpl
:
public
VAddBiasKernel
<
T
>
{
public:
explicit
VAddBiasKernelImpl
(
int
d
)
:
VAddBiasKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \
tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
/* VRelu JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VReluKernelImpl
:
public
VReluKernel
<
T
>
{
public:
explicit
VReluKernelImpl
(
int
d
)
:
VReluKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx2
);
INTRI_GT16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx512f
);
INTRI_GT16_FLOAT
(
jit
::
avx512f
);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
/* An empty JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
public:
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
};
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_exp.cc
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <cmath> // for exp
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
#ifdef __AVX__
namespace
detail
{
__m256
Exp
(
__m256
a
);
}
// namespace detail
#endif
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
/* VExp JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VExpKernelImpl
:
public
VExpKernel
<
T
>
{
public:
explicit
VExpKernelImpl
(
int
d
)
:
VExpKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VExpKernelImpl<float, isa, block>::Compute(const float* x, float* y) \
const { \
platform::dynload::vsExp(this->num_, x, y); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VExpKernelImpl<double, isa, block>::Compute(const double* x, double* y) \
const { \
platform::dynload::vdExp(this->num_, x, y); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kLT8
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT8LT16
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, detail::Exp(tmp)); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = detail::Exp(tmp0); \
tmp1 = detail::Exp(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL
(
vexp
,
VExpKernel
);
/* VSigmoid JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VSigmoidKernelImpl
:
public
VSigmoidKernel
<
T
>
{
public:
explicit
VSigmoidKernelImpl
(
int
d
)
:
VSigmoidKernel
<
T
>
()
{
this
->
num_
=
d
;
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vexp_
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
private:
std
::
shared_ptr
<
const
VExpKernel
<
T
>>
vexp_
;
};
#define INTRI_SIGMOID(tmp, min, max) \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \
INTRI_SIGMOID(tmp1, min, max); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
// INTRI_GT8LT16_FLOAT(jit::avx2);
// INTRI_GT16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
// INTRI_GT8LT16_FLOAT(jit::avx512f);
// INTRI_GT16_FLOAT(jit::avx512f);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
/* VTanh JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VTanhKernelImpl
:
public
VTanhKernel
<
T
>
{
public:
explicit
VTanhKernelImpl
(
int
d
)
:
VTanhKernel
<
T
>
()
{
this
->
num_
=
d
;
vscal_
=
KernelPool
::
Instance
().
template
Get
<
VScalKernel
<
T
>
>
(
d
);
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
y
,
y
);
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
private:
std
::
shared_ptr
<
const
VScalKernel
<
T
>>
vscal_
;
std
::
shared_ptr
<
const
VSigmoidKernel
<
T
>>
vsigmoid_
;
std
::
shared_ptr
<
const
VAddBiasKernel
<
T
>>
vaddbias_
;
};
#define INTRI_VTANH(tmp) \
tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \
tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \
INTRI_VTANH(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
// maybe use avx at gt8lt16 and gt16
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
// maybe use avx at gt8lt16 and gt16
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
#undef JITKERNEL_NEW_ACT_IMPL
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_lstm.cc
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
#ifdef __AVX__
namespace
detail
{
__m256
Exp
(
__m256
a
);
}
// namespace detail
#endif
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
#ifdef __AVX__
typedef
enum
{
kSigmoid
,
kRelu
,
kTanh
,
kIdentity
}
act_type
;
class
AVXAct
{
public:
virtual
~
AVXAct
()
=
default
;
virtual
__m256
Compute
(
__m256
x
)
const
=
0
;
};
template
<
act_type
type
>
class
AVXActImpl
:
public
AVXAct
{
public:
__m256
Compute
(
__m256
x
)
const
override
{
PADDLE_THROW
(
"Unkown type!"
);
}
};
template
<
>
__m256
AVXActImpl
<
kSigmoid
>::
Compute
(
__m256
x
)
const
{
__m256
ones
=
_mm256_set1_ps
(
1.0
f
);
x
=
_mm256_max_ps
(
x
,
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MIN
));
x
=
_mm256_min_ps
(
x
,
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MAX
));
x
=
_mm256_sub_ps
(
_mm256_set1_ps
(
0.0
f
),
x
);
x
=
detail
::
Exp
(
x
);
x
=
_mm256_add_ps
(
ones
,
x
);
return
_mm256_div_ps
(
ones
,
x
);
}
template
<
>
__m256
AVXActImpl
<
kTanh
>::
Compute
(
__m256
x
)
const
{
__m256
ones
=
_mm256_set1_ps
(
1.0
f
);
x
=
_mm256_mul_ps
(
_mm256_set1_ps
(
-
2.0
f
),
x
);
x
=
_mm256_min_ps
(
x
,
_mm256_set1_ps
(
EXP_MAX_INPUT
));
x
=
detail
::
Exp
(
x
);
x
=
_mm256_add_ps
(
ones
,
x
);
x
=
_mm256_div_ps
(
_mm256_set1_ps
(
2.0
f
),
x
);
return
_mm256_sub_ps
(
x
,
ones
);
}
template
<
>
__m256
AVXActImpl
<
kRelu
>::
Compute
(
__m256
x
)
const
{
return
_mm256_max_ps
(
x
,
_mm256_setzero_ps
());
}
template
<
>
__m256
AVXActImpl
<
kIdentity
>::
Compute
(
__m256
x
)
const
{
return
x
;
}
#endif
template
<
typename
T
>
static
std
::
shared_ptr
<
const
VActKernel
<
T
>>
GetActKernel
(
const
std
::
string
&
type
,
int
n
)
{
if
(
type
==
"sigmoid"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"relu"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VReluKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"tanh"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VTanhKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VIdentityKernel
<
T
>
>
(
n
));
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
return
nullptr
;
}
/* LSTM JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
LSTMKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
LSTMKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d3_
=
GetActKernel
<
T
>
(
act_gate
,
d3_
);
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
#ifdef __AVX__
auto
GetAVXAct
=
[
&
](
const
std
::
string
&
type
)
->
std
::
unique_ptr
<
AVXAct
>
{
if
(
type
==
"sigmoid"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kSigmoid
>
());
}
else
if
(
type
==
"relu"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kRelu
>
());
}
else
if
(
type
==
"tanh"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kTanh
>
());
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kIdentity
>
());
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
};
avx_act_gate_
=
GetAVXAct
(
act_gate
);
avx_act_cand_
=
GetAVXAct
(
act_cand
);
avx_act_cell_
=
GetAVXAct
(
act_cell
);
#endif
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
// gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
private:
int
d_
,
d2_
,
d3_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d3_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
;
#ifdef __AVX__
std
::
unique_ptr
<
const
AVXAct
>
avx_act_gate_
,
avx_act_cand_
,
avx_act_cell_
;
#endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
float* gates, const float* ct_1, float* ct, float* ht, \
const float* wp_data, float* checked) const { \
/* gates: W_ch, W_ih, W_fh, W_oh */
\
__m256 c, i, f, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
f = _mm256_loadu_ps(gates + 16); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/
\
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
i = _mm256_loadu_ps(ct_1); \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \
f = _mm256_add_ps(c, f); \
_mm256_storeu_ps(ct, f); \
/* H_t = act_cell(C_t) * ogated */
\
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
}
// TODO(TJ): optimize keq16
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
/* Peephole JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
PeepholeKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
PeepholeKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
vadd_d2_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d2_
);
act_gate_d2_
=
GetActKernel
<
T
>
(
act_gate
,
d2_
);
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
/* get fgated and igated*/
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
private:
int
d_
,
d2_
,
d3_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d2_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
,
vadd_d2_
;
};
#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const LSTMKernel<ker_dtype>> \
KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&, \
const std::string&, const std::string&, int, bool>( \
const std::string& act_gate, const std::string& act_cand, \
const std::string& act_cell, int d, bool use_peephole)
#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
(use_peephole ? "p" : "n")
#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \
if (use_peephole) { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<PeepholeKernelImpl<dtype, isa, k>>( \
act_gate, act_cand, act_cell, d)); \
} else { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
act_cell, d)); \
}
REGISTER_JITKERNEL_ARGS
(
lstm
,
LSTMKernel
,
JITKERNEL_DECLARE_LSTM
,
JITKERNEL_KEY_LSTM
,
JITKERNEL_NEW_LSTM_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_KEY_LSTM
#undef JITKERNEL_NEW_LSTM_IMPL
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_macro.h
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \
} else if (d == AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ8); \
} else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kGT8LT16); \
} else if (d == AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ16); \
} else { \
macro_(ker, dtype, isa, kGT16); \
}
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \
} else if (jit::MayIUse(jit::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \
} else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
}
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
#define JITKERNEL_KEY(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d)
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(d))
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
#define REGISTER_JITKERNEL(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
macro_impl) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
macro_impl); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \
macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \
macro_(jit::avx2, block); \
macro_(jit::avx, block); \
macro_(jit::isa_any, block)
#define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \
macro_(isa, kEQ8); \
macro_(isa, kGT8LT16); \
macro_(isa, kEQ16); \
macro_(isa, kGT16)
#define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any)
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_test.cc
0 → 100644
浏览文件 @
6447155d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <sys/time.h>
#include <cmath> // for exp
#include <cstring> // for memcpy
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
constexpr
int
repeat
=
20000
;
inline
double
GetCurrentUS
()
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+6
*
time
.
tv_sec
+
time
.
tv_usec
;
}
template
<
typename
T
>
void
RandomVec
(
const
int
n
,
T
*
a
,
const
T
lower
=
static_cast
<
T
>
(
-
20.
f
),
const
T
upper
=
static_cast
<
T
>
(
20.
f
))
{
static
unsigned
int
seed
=
100
;
std
::
mt19937
rng
(
seed
++
);
std
::
uniform_real_distribution
<
double
>
uniform_dist
(
0
,
1
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
a
[
i
]
=
static_cast
<
T
>
(
uniform_dist
(
rng
)
*
(
upper
-
lower
)
+
lower
);
}
}
void
vrelu_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0.
f
?
x
[
i
]
:
0.
f
;
}
}
#if defined __AVX__ || defined __AVX2__
void
vrelu_intri8
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_max_ps
(
tmp
,
_mm256_setzero_ps
());
_mm256_storeu_ps
(
y
,
tmp
);
}
#endif
TEST
(
JitKernel
,
vrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
10.
f
,
1.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VReluKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vrelu_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vrelu_intri8
(
d
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vaddbias_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
TEST
(
JitKernel
,
vaddbias
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddBiasKernel
<
float
>
>
(
d
);
const
float
a
=
2.
f
;
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vaddbias_ref
(
d
,
a
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vexp_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
#ifdef PADDLE_WITH_MKLML
void
vexp_mkl
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
paddle
::
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
}
#endif
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vexp_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vexp_mkl
(
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
inline
float
_sigmoid
(
float
x
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
float
tmp
=
(
x
<
min
)
?
min
:
((
x
>
max
)
?
max
:
x
);
return
1.
f
/
(
1.
f
+
std
::
exp
(
-
tmp
));
}
void
vsigmoid_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_sigmoid
(
x
[
i
]);
}
}
void
vsigmoid_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
0.
f
-
y
[
i
];
}
vexp
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
}
}
TEST
(
JitKernel
,
vsigmoid
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d
);
const
auto
&
vexp
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vsigmoid_better
(
vexp
,
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vsigmoid_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
inline
float
_tanh
(
float
x
)
{
return
2.
f
*
_sigmoid
(
2.
f
*
x
)
-
1.
f
;
}
void
vtanh_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_tanh
(
x
[
i
]);
}
}
void
vtanh_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VScalKernel
<
float
>>&
vscal
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VAddBiasKernel
<
float
>>&
vaddbias
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vscal
->
Compute
(
2.
f
,
x
,
y
);
vsigmoid
->
Compute
(
y
,
y
);
vscal
->
Compute
(
2.
f
,
y
);
vaddbias
->
Compute
(
-
1.
f
,
y
,
y
);
}
TEST
(
JitKernel
,
vtanh
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
const
auto
&
vscal
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VScalKernel
<
float
>
>
(
d
);
const
auto
&
vsigmoid
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d
);
const
auto
&
vaddbias
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddBiasKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vtanh_better
(
vscal
,
vsigmoid
,
vaddbias
,
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vtanh_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
lstm_ctht_ref
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid_3d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VTanhKernel
<
float
>>&
vtanh_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp_1
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
const
float
*
i
=
gates
+
d
,
*
f
=
gates
+
d
*
2
,
*
o
=
gates
+
d
*
3
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
k
=
0
;
k
<
d
;
++
k
)
{
// C_t = C_t-1 * fgated + cand_gated * igated
ct
[
k
]
=
ct_1
[
k
]
*
f
[
k
]
+
gates
[
k
]
*
i
[
k
];
// H_t = act_cell(C_t) * ogated
float
tmp
=
ct
[
k
]
*
2
;
tmp
=
0.
f
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vexp_1
->
Compute
(
&
tmp
,
&
tmp
);
tmp
=
2.
f
/
(
1.
f
+
tmp
)
-
1.
f
;
ht
[
k
]
=
tmp
*
o
[
k
];
}
}
void
lstm_ctht_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid_3d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VTanhKernel
<
float
>>&
vtanh_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VMulKernel
<
float
>>&
vmul_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd_d
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
int
d2
=
d
*
2
;
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
);
/* H_t = act_cell(C_t) * ogated */
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
);
}
TEST
(
JitKernel
,
lstm
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
int
d4
=
d
*
4
;
int
d3
=
d
*
3
;
std
::
vector
<
float
>
x
(
d4
),
xref
(
d4
);
std
::
vector
<
float
>
ct_1
(
d
),
ct_tgt
(
d
),
ht_tgt
(
d
);
std
::
vector
<
float
>
ct_ref
(
d
),
ht_ref
(
d
);
RandomVec
<
float
>
(
d4
,
x
.
data
(),
-
2.
f
,
2.
f
);
RandomVec
<
float
>
(
d
,
ct_1
.
data
(),
-
2.
f
,
2.
f
);
memcpy
(
xref
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d4
);
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
d
,
false
);
// below kernels are used to compute refer
const
auto
&
vsigmoid_3d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d3
);
const
auto
&
vtanh_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
const
auto
&
vexp_1
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
1
);
const
auto
&
vmul_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
const
auto
&
vadd_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddKernel
<
float
>
>
(
d
);
float
*
x_data
=
x
.
data
();
float
*
xref_data
=
xref
.
data
();
const
float
*
ct_1_data
=
ct_1
.
data
();
float
*
ct_tgt_data
=
ct_tgt
.
data
();
float
*
ht_tgt_data
=
ht_tgt
.
data
();
float
*
ct_ref_data
=
ct_ref
.
data
();
float
*
ht_ref_data
=
ht_ref
.
data
();
// compute once to check correctness
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ct_tgt_data
[
i
],
ct_ref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ht_tgt_data
[
i
],
ht_ref_data
[
i
],
1e-3
);
}
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
lstm_ctht_better
(
vsigmoid_3d
,
vtanh_d
,
vmul_d
,
vadd_d
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
}
}
void
vscal_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
vscal_inp_ref
(
const
int
n
,
const
float
a
,
float
*
x
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vscal_intri8
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
;
__m256
scalar
=
_mm256_set1_ps
(
a
);
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_mul_ps
(
tmp
,
scalar
);
_mm256_storeu_ps
(
y
,
tmp
);
}
void
vscal_inp_intri8
(
const
int
n
,
const
float
a
,
float
*
x
)
{
__m256
tmp
;
__m256
scalar
=
_mm256_set1_ps
(
a
);
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_mul_ps
(
tmp
,
scalar
);
_mm256_storeu_ps
(
x
,
tmp
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vscal_inp_mkl
(
const
int
n
,
const
float
a
,
float
*
x
)
{
paddle
::
platform
::
dynload
::
cblas_sscal
(
n
,
a
,
x
,
1
);
}
#endif
TEST
(
JitKernel
,
vscal
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
std
::
memcpy
(
y
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d
);
float
a
=
2.
f
;
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VScalKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_ref
(
d
,
a
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
trefs1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_ref
(
d
,
a
,
y_data
);
}
auto
trefe1
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_mkl
(
d
,
a
,
y_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_intri8
(
d
,
a
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
auto
si2
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_intri8
(
d
,
a
,
y_data
);
}
auto
si3
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgts1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
y_data
);
}
auto
ttgte1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vmul_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vmul_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
tmpx
=
_mm256_loadu_ps
(
x
);
tmpy
=
_mm256_loadu_ps
(
y
);
tmpx
=
_mm256_mul_ps
(
tmpx
,
tmpy
);
_mm256_storeu_ps
(
z
,
tmpx
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vmul_mkl
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
paddle
::
platform
::
dynload
::
vsMul
(
n
,
x
,
y
,
z
);
}
#endif
TEST
(
JitKernel
,
vmul
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
RandomVec
<
float
>
(
d
,
y
.
data
());
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
const
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_mkl
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vadd_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vadd_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
tmpx
=
_mm256_loadu_ps
(
x
);
tmpy
=
_mm256_loadu_ps
(
y
);
tmpx
=
_mm256_add_ps
(
tmpx
,
tmpy
);
_mm256_storeu_ps
(
z
,
tmpx
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vadd_mkl
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
paddle
::
platform
::
dynload
::
vsAdd
(
n
,
x
,
y
,
z
);
}
#endif
TEST
(
JitKernel
,
vadd
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
RandomVec
<
float
>
(
d
,
y
.
data
());
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
const
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_mkl
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
TEST
(
JitKernel
,
pool
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
const
int
frame_size
=
4
;
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
const
auto
&
plstm1
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
plstm2
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
peephole
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
true
);
EXPECT_TRUE
(
plstm1
!=
peephole
);
const
auto
&
pvmul_f
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
4
);
EXPECT_TRUE
(
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
plstm2
)
!=
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_f
));
const
auto
&
pvmul_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
double
>
>
(
4
);
EXPECT_TRUE
(
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_f
)
!=
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_d
));
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf4"
);
EXPECT_EQ
(
pvmul_f
,
pvmul_from_key
);
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf5"
);
EXPECT_TRUE
(
pvmul_from_key2
==
nullptr
);
}
paddle/fluid/platform/cpu_info.cc
浏览文件 @
6447155d
...
@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
...
@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
return
cpu
.
has
(
Cpu
::
tAVX
);
return
cpu
.
has
(
Cpu
::
tAVX
);
case
avx2
:
case
avx2
:
return
cpu
.
has
(
Cpu
::
tAVX2
);
return
cpu
.
has
(
Cpu
::
tAVX2
);
case
avx512
_common
:
case
avx512
f
:
return
cpu
.
has
(
Cpu
::
tAVX512F
);
return
cpu
.
has
(
Cpu
::
tAVX512F
);
case
avx512_core
:
case
avx512_core
:
return
true
&&
cpu
.
has
(
Cpu
::
tAVX512F
)
&&
cpu
.
has
(
Cpu
::
tAVX512BW
)
&&
return
true
&&
cpu
.
has
(
Cpu
::
tAVX512F
)
&&
cpu
.
has
(
Cpu
::
tAVX512BW
)
&&
...
...
paddle/fluid/platform/cpu_info.h
浏览文件 @
6447155d
...
@@ -43,7 +43,7 @@ typedef enum {
...
@@ -43,7 +43,7 @@ typedef enum {
sse42
,
sse42
,
avx
,
avx
,
avx2
,
avx2
,
avx512
_common
,
avx512
f
,
avx512_core
,
avx512_core
,
avx512_core_vnni
,
avx512_core_vnni
,
avx512_mic
,
avx512_mic
,
...
...
paddle/fluid/platform/init.cc
浏览文件 @
6447155d
...
@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
...
@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
#endif
#endif
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx512
_common
))
{
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx512
f
))
{
#ifndef __AVX512F__
#ifndef __AVX512F__
LOG
(
WARNING
)
<<
"AVX512F is available, Please re-compile on local machine"
;
LOG
(
WARNING
)
<<
"AVX512F is available, Please re-compile on local machine"
;
#endif
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录