Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
64a90b2f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
64a90b2f
编写于
12月 17, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
use vadd, vaddrelu, lstm and gru jitkernel
上级
3713d08d
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
68 addition
and
75 deletion
+68
-75
paddle/fluid/operators/fused/fusion_gru_op.cc
paddle/fluid/operators/fused/fusion_gru_op.cc
+30
-28
paddle/fluid/operators/fused/fusion_lstm_op.cc
paddle/fluid/operators/fused/fusion_lstm_op.cc
+32
-30
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+0
-9
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+6
-8
未找到文件。
paddle/fluid/operators/fused/fusion_gru_op.cc
浏览文件 @
64a90b2f
...
...
@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
...
...
@@ -183,27 +183,29 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const int total_T = x_dims[0]; \
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const math::jitkernel::gru_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("activation")); \
math::jitkernel::gru_t one_step; \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::GRUKernel<T>, \
const math::jitkernel::gru_attr_t&>(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \
auto ComputeH1 = \
jit::Get<jit::gruh1, jit::GRUTuples, platform::CPUPlace>(attr); \
auto ComputeHtPart1 = \
jit::Get<jit::gruhtpart1, jit::GRUTuples, platform::CPUPlace>(attr); \
auto ComputeHtPart2 = \
jit::Get<jit::gruhtpart2, jit::GRUTuples, platform::CPUPlace>(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
T* xx_data = xx->mutable_data<T>(place)
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
...
...
@@ -242,7 +244,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
}
else
{
one_step
.
gates
=
xx_data
;
one_step
.
ht
=
hidden_out_data
;
ker
->
ComputeH1
(
&
one_step
,
&
attr
);
ComputeH1
(
&
one_step
,
&
attr
);
prev_hidden_data
=
hidden_out_data
;
tstart
=
1
;
move_step
();
...
...
@@ -255,12 +257,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step
.
gates
=
xx_data
;
one_step
.
ht_1
=
prev_hidden_data
;
one_step
.
ht
=
hidden_out_data
;
ker
->
ComputeHtPart1
(
&
one_step
,
&
attr
);
ComputeHtPart1
(
&
one_step
,
&
attr
);
// gemm rt * Ws
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D
,
D
,
static_cast
<
T
>
(
1
),
hidden_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
xx_data
+
D2
,
D3
);
ker
->
ComputeHtPart2
(
&
one_step
,
&
attr
);
ComputeHtPart2
(
&
one_step
,
&
attr
);
// save prev
prev_hidden_data
=
hidden_out_data
;
move_step
();
...
...
@@ -324,7 +326,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
one_step
.
gates
=
cur_in_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeH1
(
&
one_step
,
&
attr
);
ComputeH1
(
&
one_step
,
&
attr
);
// add offset
cur_in_data
+=
D3
;
cur_out_data
+=
D
;
...
...
@@ -352,7 +354,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step
.
gates
=
cur_batched_data
;
one_step
.
ht_1
=
cur_prev_hidden_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeHtPart1
(
&
one_step
,
&
attr
);
ComputeHtPart1
(
&
one_step
,
&
attr
);
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
...
...
@@ -370,7 +372,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
one_step
.
gates
=
cur_batched_data
;
one_step
.
ht_1
=
cur_prev_hidden_data
;
one_step
.
ht
=
cur_out_data
;
ker
->
ComputeHtPart2
(
&
one_step
,
&
attr
);
ComputeHtPart2
(
&
one_step
,
&
attr
);
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
...
...
paddle/fluid/operators/fused/fusion_lstm_op.cc
浏览文件 @
64a90b2f
...
...
@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace
paddle
{
...
...
@@ -236,31 +236,33 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D = wh_dims[0]; \
const int D4 = wh_dims[1]
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/
\
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/
\
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const math::jitkernel::lstm_attr_t attr( \
D, ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
ctx.Attr<std::string>("cell_activation"), use_peepholes); \
math::jitkernel::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
const auto& ker = \
math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::LSTMKernel<T>, \
const math::jitkernel::lstm_attr_t&>(attr)
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/
\
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/
\
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const jit \
: lstm_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
use_peepholes); \
math::jitkernel::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \
jit::Get<jit::lstmc1h1, jit::LSTMTuples, platform::CPUPlace>(attr); \
auto ComputeCtHt = \
jit::Get<jit::lstmctht, jit::LSTMTuples, platform::CPUPlace>(attr)
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
...
...
@@ -306,7 +308,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step
.
gates
=
xx_data
;
one_step
.
ct
=
c_out_data
;
one_step
.
ht
=
h_out_data
;
ker
->
ComputeC1H1
(
&
one_step
,
&
attr
);
ComputeC1H1
(
&
one_step
,
&
attr
);
tstart
=
1
;
// move one step
prev_h_data
=
h_out_data
;
...
...
@@ -322,7 +324,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step
.
ct_1
=
prev_c_data
;
one_step
.
ct
=
c_out_data
;
one_step
.
ht
=
h_out_data
;
ker
->
ComputeCtHt
(
&
one_step
,
&
attr
);
ComputeCtHt
(
&
one_step
,
&
attr
);
// move one step
prev_h_data
=
h_out_data
;
prev_c_data
=
c_out_data
;
...
...
@@ -402,7 +404,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step
.
gates
=
cur_in_data
;
one_step
.
ct
=
cur_c_out_data
;
one_step
.
ht
=
cur_h_out_data
;
ker
->
ComputeC1H1
(
&
one_step
,
&
attr
);
ComputeC1H1
(
&
one_step
,
&
attr
);
cur_in_data
+=
D4
;
cur_c_out_data
+=
D
;
...
...
@@ -432,7 +434,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step
.
ct_1
=
cur_prev_c_data
;
one_step
.
ct
=
cur_c_out_data
;
one_step
.
ht
=
cur_h_out_data
;
ker
->
ComputeCtHt
(
&
one_step
,
&
attr
);
ComputeC1H1
(
&
one_step
,
&
attr
);
// move one batch
cur_in_data
+=
D4
;
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
64a90b2f
...
...
@@ -73,12 +73,3 @@ if(WITH_GPU)
endif
()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat_and_split
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
# set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
# set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
# if(WITH_XBYAK)
# list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
# list(APPEND JIT_KERNEL_DEPS xbyak)
# endif()
# cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
# cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
64a90b2f
...
...
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -30,22 +30,20 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
return
;
}
if
(
relu
)
{
const
auto
&
vaddrelu
=
jitkernel
::
KernelPool
::
Instance
()
.
template
Get
<
jitkernel
::
VAddReluKernel
<
T
>
>
(
N
);
auto
compute
=
jit
::
Get
<
jit
::
vaddrelu
,
jit
::
XYZNTuples
,
platform
::
CPUPlcace
>
(
N
);
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
vaddrelu
->
C
ompute
(
B
,
dst
,
dst
,
N
);
c
ompute
(
B
,
dst
,
dst
,
N
);
}
}
else
{
const
auto
&
vadd
=
jitkernel
::
KernelPool
::
Instance
()
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
auto
compute
=
jit
::
Get
<
jit
::
vadd
,
jit
::
XYZNTuples
,
platform
::
CPUPlcace
>
(
N
);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
vadd
->
C
ompute
(
B
,
dst
,
dst
,
N
);
c
ompute
(
B
,
dst
,
dst
,
N
);
}
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录