Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
664159ad
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
664159ad
编写于
10月 22, 2018
作者:
T
tensor-tang
提交者:
GitHub
10月 22, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #13998 from tensor-tang/fea/fusion_seqconv_add
Fea/fusion seqconv eltadd relu
上级
765085d2
40f8456a
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
819 addition
and
72 deletion
+819
-72
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+1
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+45
-0
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+34
-0
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+101
-0
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+38
-0
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+12
-11
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...le/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+7
-1
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-1
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
+229
-0
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
+42
-0
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+15
-9
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+6
-0
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+88
-0
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+57
-0
python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
...uid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
+94
-0
python/paddle/fluid/tests/unittests/test_seq_conv.py
python/paddle/fluid/tests/unittests/test_seq_conv.py
+49
-50
未找到文件。
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
664159ad
...
@@ -37,6 +37,7 @@ pass_library(embedding_fc_lstm_fuse_pass inference)
...
@@ -37,6 +37,7 @@ pass_library(embedding_fc_lstm_fuse_pass inference)
pass_library
(
fc_gru_fuse_pass inference
)
pass_library
(
fc_gru_fuse_pass inference
)
pass_library
(
seq_concat_fc_fuse_pass inference
)
pass_library
(
seq_concat_fc_fuse_pass inference
)
pass_library
(
conv_bn_fuse_pass inference
)
pass_library
(
conv_bn_fuse_pass inference
)
pass_library
(
seqconv_eltadd_relu_fuse_pass inference
)
if
(
WITH_MKLDNN
)
if
(
WITH_MKLDNN
)
pass_library
(
mkldnn_placement_pass base
)
pass_library
(
mkldnn_placement_pass base
)
pass_library
(
conv_bias_mkldnn_fuse_pass inference
)
pass_library
(
conv_bias_mkldnn_fuse_pass inference
)
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
664159ad
...
@@ -761,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()(
...
@@ -761,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()(
return
relu_out_var
;
return
relu_out_var
;
}
}
// Describes the chain  sequence_conv -> elementwise_add -> relu.
//
// seqconv_input: pattern node of the variable that feeds input "X" of the
//                sequence_conv op; it is further constrained here.
// Returns the pattern node of the variable produced by relu.
PDNode *patterns::SeqConvEltAddRelu::operator()(
    paddle::framework::ir::PDNode *seqconv_input) {
  // The entry variable must be consumed as "X" by a sequence_conv op.
  seqconv_input->assert_is_op_input("sequence_conv", "X");

  // ---- operator nodes ----
  // Only sequence_conv ops with paddingTrainable == false and
  // contextStride == 1 are accepted.
  auto *conv_node = pattern->NewNode(seqconv_repr())
                        ->assert_is_op("sequence_conv")
                        ->assert_op_attr<bool>("paddingTrainable", false)
                        ->assert_op_attr<int>("contextStride", 1);
  auto *add_node =
      pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
  auto *act_node = pattern->NewNode(relu_repr())->assert_is_op("relu");

  // ---- variable nodes ----
  // Convolution filter: a persistable input of sequence_conv.
  auto *filter_node = pattern->NewNode(seqconv_weight_repr())
                          ->AsInput()
                          ->assert_is_persistable_var()
                          ->assert_is_op_input("sequence_conv", "Filter");
  // Bias: an input of elementwise_add.
  auto *bias_node = pattern->NewNode(eltadd_bias_repr())
                        ->AsInput()
                        ->assert_is_op_input("elementwise_add");
  // Intermediate variables; marked so that the fuse step can drop them.
  auto *conv_out_node = pattern->NewNode(seqconv_out_repr())
                            ->AsIntermediate()
                            ->assert_is_only_output_of_op("sequence_conv")
                            ->assert_is_op_input("elementwise_add");
  auto *add_out_node = pattern->NewNode(eltadd_out_repr())
                           ->AsIntermediate()
                           ->assert_is_only_output_of_op("elementwise_add")
                           ->assert_is_only_input_of_op("relu");
  // Final output of the chain.
  auto *result_node = pattern->NewNode(relu_out_repr())
                          ->AsOutput()
                          ->assert_is_op_output("relu");

  // ---- edges ----
  conv_node->LinksFrom({seqconv_input, filter_node}).LinksTo({conv_out_node});
  add_node->LinksFrom({conv_out_node, bias_node}).LinksTo({add_out_node});
  act_node->LinksFrom({add_out_node}).LinksTo({result_node});

  return result_node;
}
PDNode
*
patterns
::
FC
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
x
,
PDNode
*
patterns
::
FC
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
x
,
bool
with_bias
)
{
bool
with_bias
)
{
// Create shared nodes.
// Create shared nodes.
...
...
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
664159ad
...
@@ -128,6 +128,15 @@ struct PDNode {
...
@@ -128,6 +128,15 @@ struct PDNode {
const
std
::
unordered_set
<
std
::
string
>&
op_types
,
const
std
::
unordered_set
<
std
::
string
>&
op_types
,
const
std
::
string
&
argument
,
int
nth
);
const
std
::
string
&
argument
,
int
nth
);
// Adds a constraint: the matched node must be an operator that carries the
// attribute `attr_name`, and that attribute (read as type T) must equal
// `attr`. Returns this node so that further assertions can be chained.
template <typename T>
PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
  // Both arguments are copied into the closure because it is stored in
  // asserts_ and may be evaluated after this call has returned.
  asserts_.emplace_back([attr_name, attr](Node* node) -> bool {
    if (!node) return false;
    if (!node->IsOp()) return false;
    if (!node->Op()->HasAttr(attr_name)) return false;
    return boost::get<T>(node->Op()->GetAttr(attr_name)) == attr;
  });
  return this;
}
private:
private:
PDNode
(
PDPattern
*
pattern
,
const
std
::
string
&
name
=
""
,
PDNode
(
PDPattern
*
pattern
,
const
std
::
string
&
name
=
""
,
Type
type
=
Type
::
kVar
)
Type
type
=
Type
::
kVar
)
...
@@ -434,6 +443,31 @@ struct ConvReLU : public PatternBase {
...
@@ -434,6 +443,31 @@ struct ConvReLU : public PatternBase {
PATTERN_DECL_NODE
(
relu_out
);
PATTERN_DECL_NODE
(
relu_out
);
};
};
// Pattern for sequence_conv followed by elementwise_add and relu.
//   operator nodes: seqconv, eltadd, relu
//   variable nodes: seqconv_weight, seqconv_out, eltadd_bias, eltadd_out,
//                   relu_out
// The node for the sequence_conv input is not declared here; the caller
// passes it to operator().
struct SeqConvEltAddRelu : PatternBase {
  SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}

  // Builds the pattern starting from `seqconv_input`.
  PDNode* operator()(PDNode* seqconv_input);

  // Operator nodes.
  PATTERN_DECL_NODE(seqconv);
  PATTERN_DECL_NODE(eltadd);
  PATTERN_DECL_NODE(relu);

  // Variable nodes.
  PATTERN_DECL_NODE(seqconv_weight);
  PATTERN_DECL_NODE(seqconv_out);
  PATTERN_DECL_NODE(eltadd_bias);
  PATTERN_DECL_NODE(eltadd_out);
  PATTERN_DECL_NODE(relu_out);
};
// FC with bias
// FC with bias
// op: mul + elementwise_add
// op: mul + elementwise_add
// named nodes:
// named nodes:
...
...
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
0 → 100644
浏览文件 @
664159ad
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Finds every sequence_conv + elementwise_add + relu chain in `graph` and
// replaces it with one fusion_seqconv_eltadd_relu op.
//
// graph:      graph that is rewritten in place.
// name_scope: prefix used when naming the pattern nodes.
// scope:      NOTE(review): never read in this function; the scope is
//             fetched from the graph attribute kParamScopeAttr instead.
// Returns the number of chains that were fused.
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
  GraphPatternDetector detector;
  auto* pattern = detector.mutable_pattern();

  // Entry of the pattern: a non-persistable variable read by sequence_conv.
  PDNode* entry_var = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
                          ->assert_is_op_input("sequence_conv")
                          ->assert_var_not_persistable();
  patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
  fuse_pattern(entry_var);

  // Creates the fused op node and wires it to its inputs and output.
  auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
                          Node* eltadd_bias, Node* relu_out) {
    OpDesc fused_desc;
    fused_desc.SetType("fusion_seqconv_eltadd_relu");
    fused_desc.SetInput("X", {input->Name()});
    fused_desc.SetInput("Filter", {seqconv_weight->Name()});
    fused_desc.SetInput("Bias", {eltadd_bias->Name()});
    // The context attributes are forwarded unchanged from sequence_conv.
    fused_desc.SetAttr("contextLength",
                       seqconv->Op()->GetAttr("contextLength"));
    fused_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
    fused_desc.SetAttr("contextStride",
                       seqconv->Op()->GetAttr("contextStride"));

    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
    auto* var_scope = graph->Get<Scope*>(kParamScopeAttr);

    // ColMat is an extra output of the fused op; it gets a fresh unique name
    // and a LoDTensor variable in the parameter scope.
    const std::string col_mat_name = patterns::UniqueKey("SeqConvColMat");
    fused_desc.SetOutput("ColMat", {col_mat_name});
    fused_desc.SetOutput("Out", {relu_out->Name()});
    var_scope->Var(col_mat_name)->GetMutable<LoDTensor>();

    auto* fused_node = graph->CreateOpNode(&fused_desc);
    IR_NODE_LINK_TO(input, fused_node);
    IR_NODE_LINK_TO(seqconv_weight, fused_node);
    IR_NODE_LINK_TO(eltadd_bias, fused_node);
    IR_NODE_LINK_TO(fused_node, relu_out);
    return fused_node;
  };

  int fusion_count = 0;

  // Called once per detected subgraph.
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);

    fuse_creator(seqconv, subgraph.at(entry_var), seqconv_weight, eltadd_bias,
                 relu_out);

    // The three original ops and the two intermediate variables are removed.
    std::unordered_set<const Node*> obsolete_nodes{seqconv, seqconv_out, eltadd,
                                                   eltadd_out, relu};
    GraphSafeRemoveNodes(graph, obsolete_nodes);
    ++fusion_count;
  };

  detector(graph, handler);
  return fusion_count;
}
// Applies the seqconv + eltadd + relu fusion to `graph`, reports the number
// of fused subgraphs through AddStatis, and returns the same graph.
std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  ir::Graph* const raw_graph = graph.get();
  FusePassBase::Init(name_scope_, raw_graph);

  const int num_fused = BuildFusion(raw_graph, name_scope_, param_scope());
  AddStatis(num_fused);

  return graph;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
seqconv_eltadd_relu_fuse_pass
,
paddle
::
framework
::
ir
::
SeqConvEltAddReluFusePass
);
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
0 → 100644
浏览文件 @
664159ad
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Graph pass declared on top of FusePassBase; its ApplyImpl is defined in
// seqconv_eltadd_relu_fuse_pass.cc.
class SeqConvEltAddReluFusePass : public FusePassBase {
 public:
  virtual ~SeqConvEltAddReluFusePass() {}

 protected:
  // Takes ownership of `graph` and returns a graph to the caller.
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;

  // Name scope used by this pass.
  const std::string name_scope_ = "seqconv_eltadd_relu_fuse";
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
664159ad
...
@@ -67,17 +67,18 @@ class Analyzer : public OrderedRegistry<PassManager> {
...
@@ -67,17 +67,18 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion.
// larger fusion.
const
std
::
vector
<
std
::
string
>
all_ir_passes_
{{
const
std
::
vector
<
std
::
string
>
all_ir_passes_
{{
// Manual update the passes here.
// Manual update the passes here.
"infer_clean_graph_pass"
,
//
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"embedding_fc_lstm_fuse_pass"
,
//
"seqconv_eltadd_relu_fuse_pass"
,
//
"fc_lstm_fuse_pass"
,
//
"embedding_fc_lstm_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"fc_lstm_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
"conv_bias_mkldnn_fuse_pass"
,
//
"conv_bias_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
浏览文件 @
664159ad
...
@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
...
@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
int
num_ops
;
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
auto
fuse_statis
=
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"seqconv_eltadd_relu_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
2
);
EXPECT_EQ
(
fuse_statis
.
at
(
"seqconv_eltadd_relu_fuse"
),
6
);
EXPECT_EQ
(
num_ops
,
32
);
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
664159ad
...
@@ -86,7 +86,7 @@ function(op_library TARGET)
...
@@ -86,7 +86,7 @@ function(op_library TARGET)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
"hierarchical_sigmoid_op"
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
"hierarchical_sigmoid_op"
"crf_decoding_op"
"select_op"
"lstmp_op"
"gru_op"
"fusion_gru_op"
"lstm_op"
"fusion_lstm_op"
"cumsum_op"
"crf_decoding_op"
"select_op"
"lstmp_op"
"gru_op"
"fusion_gru_op"
"lstm_op"
"fusion_lstm_op"
"cumsum_op"
"channel_send_op"
"channel_create_op"
"channel_close_op"
"channel_recv_op"
)
"fusion_seqconv_eltadd_relu_op"
"channel_send_op"
"channel_create_op"
"channel_close_op"
"channel_recv_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
windows_unsupport_op
}
"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
windows_unsupport_op
}
"
)
return
()
return
()
endif
()
endif
()
...
...
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
0 → 100644
浏览文件 @
664159ad
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
namespace
paddle
{
namespace
operators
{
// Validates the inputs and attributes of fusion_seqconv_eltadd_relu and sets
// the output shapes:
//   Out    -> (x_dims[0], w_dims[1])
//   ColMat -> (x_dims[0], w_dims[0])
// Out shares the LoD of X.
//
// Enforced conditions: X, Filter, Bias, Out and ColMat are present;
// contextStride == 1; X and Filter are 2-D; Filter's height equals
// contextLength * x_dims[1]; contextLength + contextStart > 0.
void FusionSeqConvEltAddReluOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"),
                 "Input(X) of FusionSeqConvEltAddReluOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasInput("Filter"),
      "Input(Filter) of FusionSeqConvEltAddReluOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasInput("Bias"),
      "Input(Bias) of FusionSeqConvEltAddReluOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("Out"),
      "Output(Out) of FusionSeqConvEltAddReluOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("ColMat"),
      "Output(ColMat) of FusionSeqConvEltAddReluOp should not be null.");

  auto x_dims = ctx->GetInputDim("X");
  auto w_dims = ctx->GetInputDim("Filter");
  int context_length = ctx->Attrs().Get<int>("contextLength");
  PADDLE_ENFORCE(
      ctx->Attrs().Get<int>("contextStride") == 1,
      "Currently, FusionSeqConvEltAddReluOp only supports contextStride=1.");
  // The rank check has to come before any dims[...] access below. (It used
  // to be written twice in a row; a single check is sufficient.)
  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
                 "Input(X, Filter) should be 2-D tensor.");
  PADDLE_ENFORCE(w_dims[0] == context_length * x_dims[1],
                 "Filter's height should be context_length * "
                 "input_hidden_size .");
  PADDLE_ENFORCE_GT(context_length + ctx->Attrs().Get<int>("contextStart"), 0,
                    "contextStart size should be smaller than contextLength.");

  ctx->SetOutputDim("Out", {x_dims[0], w_dims[1]});
  ctx->SetOutputDim("ColMat", {x_dims[0], w_dims[0]});
  ctx->ShareLoD("X", "Out");
}
// Selects the kernel from the element type of input X and the device
// context the op is executed with.
framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  const auto* input_x = ctx.Input<framework::LoDTensor>("X");
  const auto data_type = framework::ToDataType(input_x->type());
  return framework::OpKernelType(data_type, ctx.device_context());
}
// Declares the inputs (X, Filter, Bias), outputs (Out, ColMat) and attributes
// (contextLength, contextStart, contextStride) of fusion_seqconv_eltadd_relu.
//
// Fixes to the generated documentation strings: adjacent string literals are
// concatenated without a separator, so missing trailing spaces glued words
// together ("parameter.This", "supportscontextStride=1."); a duplicated
// "where T is" was removed; the op comment now mentions the ReLU.
void FusionSeqConvEltAddReluOpMaker::Make() {
  AddInput("X",
           "(LoDTensor) the input is a LodTensor, which support "
           "variable-time length input sequence. The underlying tensor in "
           "this LoDTensor is a matrix with shape (T X M), where T is the "
           "total time steps in this mini-batch, M is the dim size of x.");
  // PaddingData only support false yet, should be ensured at pass.
  AddInput("Filter",
           "(Tensor) same as the input(Filter) of sequence conv op is an "
           "learnable parameter. "
           "This is a tensor with shape (K, N), where K is the "
           "context_length * dim size of x, N is the output feature size.");
  AddInput("Bias",
           "(Tensor) the learnable weights. shape (1, N), where N is the "
           "output feature size");
  AddOutput(
      "Out",
      "(LoDTensor) the output(Out) is a LodTensor, which support "
      "variable-time length output sequence. The underlying tensor in "
      "this LoDTensor is a matrix with shape (T, N), where, T is the "
      "total time steps in this mini-batch, N is the output feature size.");
  // ColMat is declared as an intermediate output.
  AddOutput("ColMat",
            "(Tensor) (T, K), where T is the "
            "total time steps in this mini-batch, K is height of Filter")
      .AsIntermediate();
  AddAttr<int>("contextLength",
               "(int) the contextLength of FusionSeqConvEltAddReluOp is the "
               "height of the convolution kernel.")
      .GreaterThan(0);
  AddAttr<int>("contextStart",
               "(int, default:0) the contextStart of FusionSeqConvEltAddReluOp "
               "represents the beginning of the convolution of the number of "
               "rows of sequence, which can be negative. The negative number "
               "means to pad contextStart time-steps of zeros or learnable "
               "parameters at the beginning of each instance. The positive "
               "number means to skip contextStart time-steps of each "
               "instance.")
      .SetDefault(0);
  AddAttr<int>(
      "contextStride",
      "(int, default:1) the contextStride of FusionSeqConvEltAddReluOp "
      "represents the stride length of convolution kernel. "
      "Currently, FusionSeqConvEltAddReluOp only supports "
      "contextStride=1.")
      .SetDefault(1)
      .GreaterThan(0);
  AddComment(R"DOC(
Fusion Sequence Conv, ElementwiseAdd and ReLU Operator.
)DOC");
}
// CPU kernel of fusion_seqconv_eltadd_relu.
//
// Step 1 (im2col): for every sequence in X (one LoD level), fill the rows of
//   ColMat that belong to it. A ColMat row is col_mat_w = w_dims[0] elements
//   wide and is treated as consecutive blocks of src_mat_w = x_dims[1]
//   elements; positions that fall outside the sequence are zero.
// Step 2: a single FCCompute call on ColMat, Filter and Bias writes Out.
template <typename T>
class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto* x = ctx.Input<LoDTensor>("X");
    auto* w = ctx.Input<Tensor>("Filter");
    auto* b = ctx.Input<Tensor>("Bias");
    auto* y = ctx.Output<LoDTensor>("Out");
    auto* col = ctx.Output<Tensor>("ColMat");

    auto x_lod = x->lod();
    auto x_dims = x->dims();
    auto w_dims = w->dims();
    PADDLE_ENFORCE_EQ(b->numel(), w_dims[1],
                      "bias size should be equal to output feature size.");
    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL,
                      "Only support one level sequence now.");

    const T* x_data = x->data<T>();
    const T* w_data = w->data<T>();
    const T* b_data = b->data<T>();
    T* y_data = y->mutable_data<T>(ctx.GetPlace());
    T* col_data = col->mutable_data<T>(ctx.GetPlace());

    int context_start = ctx.Attr<int>("contextStart");
    int context_length = ctx.Attr<int>("contextLength");
    // Number of rows at the top / bottom of a sequence whose context window
    // reaches outside the sequence.
    int up_pad = std::max(0, -context_start);
    int down_pad = std::max(0, context_start + context_length - 1);
    // im2col
    // *_w values are counted in elements, *_sz values in bytes.
    int src_mat_w = static_cast<int>(x_dims[1]);
    int src_mat_w_sz = src_mat_w * sizeof(T);
    int col_mat_w = static_cast<int>(w_dims[0]);
    int col_mat_w_sz = col_mat_w * sizeof(T);
    // One iteration per sequence; x_lod[0] holds the row offsets.
    for (int i = 0; i < static_cast<int>(x_lod[0].size()) - 1; ++i) {
      int st = x_lod[0][i];
      int ed = x_lod[0][i + 1];
      const T* src_data = x_data + st * src_mat_w;
      T* dst_data = col_data + st * col_mat_w;
      int seq_len = ed - st;
      if (seq_len > up_pad + down_pad) {
        // Long sequence: top-padded rows, full rows, bottom-padded rows.
        // NOTE(review): src_data is never advanced by context_start rows and
        // copy_size in the bottom loop is not clamped; with context_start > 0
        // down_pad >= context_length, so copy_size can reach <= 0 there.
        // This branch looks like it assumes context_start <= 0 - confirm.

        // zero all up_pad and fill data
        std::memset(dst_data, 0, up_pad * col_mat_w_sz);
        // Skip the up_pad leading (zero) blocks of the first row.
        dst_data = dst_data + up_pad * src_mat_w;
        int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz;
        for (int j = 0; j < up_pad; ++j) {
          // blas.VCOPY?
          std::memcpy(dst_data, src_data, copy_size);
          // Next row starts one block further left and takes one more block.
          dst_data += (col_mat_w - src_mat_w);
          copy_size += src_mat_w_sz;
        }
        // fill data
        // Here copy_size == col_mat_w_sz: whole rows are copied while the
        // source window slides down one input row per output row.
        for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
          std::memcpy(dst_data, src_data, copy_size);
          dst_data += col_mat_w;
          src_data += src_mat_w;
        }
        // zero all down_pad and fill data
        std::memset(dst_data, 0, down_pad * col_mat_w_sz);
        copy_size -= src_mat_w_sz;
        for (int j = 0; j < down_pad; ++j) {
          // Each further row keeps one block fewer; the tail stays zero.
          std::memcpy(dst_data, src_data, copy_size);
          dst_data += col_mat_w;
          src_data += src_mat_w;
          copy_size -= src_mat_w_sz;
        }
      } else {
        // Short sequence (seq_len <= up_pad + down_pad): zero all of its
        // rows, then copy what is available from the top and from the bottom.
        PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1);
        std::memset(dst_data, 0, seq_len * col_mat_w_sz);
        dst_data = dst_data + up_pad * src_mat_w;
        // zero_sz: bytes of padding in the current row;
        // cur_src_sz: bytes of the whole sequence.
        int zero_sz = up_pad * src_mat_w_sz;
        int cur_src_sz = seq_len * src_mat_w_sz;
        for (int j = 0; j < std::min(up_pad, seq_len); ++j) {
          // Never copy more than the sequence holds or the row has room for.
          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
          std::memcpy(dst_data, src_data, copy_size);
          dst_data += (col_mat_w - src_mat_w);
          zero_sz -= src_mat_w_sz;
        }
        // from bottom
        // dst_data now points one past the last ColMat row of the sequence
        // and the loop walks upwards, one row per iteration.
        dst_data = col_data + ed * col_mat_w;
        src_data = x_data + st * src_mat_w;
        zero_sz = down_pad * src_mat_w_sz;
        for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
          // Destination ends zero_sz bytes before the end of the row.
          std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
                      src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
                      copy_size);
          dst_data -= col_mat_w;
          zero_sz -= src_mat_w_sz;
        }
      }
    }
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    // Sizes passed: x_dims[0] rows, w_dims[1] output columns, w_dims[0] inner
    // dimension. NOTE(review): the trailing `true` is presumably FCCompute's
    // relu flag (bias add followed by ReLU) - confirm against fc_compute.h.
    math::FCCompute<DeviceContext, T>(blas, x_dims[0], w_dims[1], w_dims[0],
                                      col_data, w_data, y_data, b_data, true);
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fusion_seqconv_eltadd_relu
,
ops
::
FusionSeqConvEltAddReluOp
,
ops
::
FusionSeqConvEltAddReluOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
fusion_seqconv_eltadd_relu
,
ops
::
FusionSeqConvEltAddReluKernel
<
float
>
,
ops
::
FusionSeqConvEltAddReluKernel
<
double
>
);
paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
0 → 100644
浏览文件 @
664159ad
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
// Operator class for the fused sequence-conv op. It reuses the constructors
// of framework::OperatorWithKernel and overrides shape inference and kernel
// type selection; both are defined in the matching .cc file.
class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Shape inference hook.
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  // Returns the kernel type to run for the given execution context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};
// Proto maker for the fused sequence-conv op; Make() is defined in the
// matching .cc file.
class FusionSeqConvEltAddReluOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
664159ad
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
DECLARE_int32
(
paddle_num_threads
);
...
@@ -30,20 +31,25 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
...
@@ -30,20 +31,25 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
if
(
B
==
NULL
)
{
if
(
B
==
NULL
)
{
return
;
return
;
}
}
if
(
relu
)
{
const
auto
&
vaddrelu
=
jitkernel
::
KernelPool
::
Instance
()
.
template
Get
<
jitkernel
::
VAddReluKernel
<
T
>
>
(
N
);
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
vaddrelu
->
Compute
(
B
,
dst
,
dst
);
}
}
else
{
const
auto
&
vadd
=
jitkernel
::
KernelPool
::
Instance
()
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
blas
.
AXPY
(
N
,
static_cast
<
T
>
(
1
),
B
,
Y
+
i
*
N
);
T
*
dst
=
Y
+
i
*
N
;
vadd
->
Compute
(
B
,
dst
,
dst
);
}
}
}
if
(
!
relu
)
{
return
;
}
// TODO(TJ): fuse relu
LOG
(
FATAL
)
<<
"Not implemented!"
;
}
}
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
664159ad
...
@@ -86,6 +86,12 @@ class VAddBiasKernel : public Kernel {
...
@@ -86,6 +86,12 @@ class VAddBiasKernel : public Kernel {
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
// Abstract kernel interface taking two read-only input arrays (x, y) and one
// output array (z) of element type T.
// NOTE(review): judging by the name and by VAddReluKernelImpl, the intended
// contract is z[i] = x[i] + y[i] with non-positive results replaced by 0 -
// confirm against the implementations.
template <typename T>
class VAddReluKernel : public Kernel {
 public:
  virtual void Compute(const T* x, const T* y, T* z) const = 0;
};
template
<
typename
T
>
template
<
typename
T
>
class
VActKernel
:
public
Kernel
{
class
VActKernel
:
public
Kernel
{
public:
public:
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
664159ad
...
@@ -378,11 +378,99 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
...
@@ -378,11 +378,99 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
};
};
/* VAddRelu JitKernel */
// Generic implementation: for every index below num_, stores x[i] + y[i]
// into z[i] when that sum is greater than 0 and stores 0 otherwise.
// Each element is read before it is written, so z may alias x or y.
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddReluKernelImpl : public VAddReluKernel<T> {
 public:
  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }

  void Compute(const T* x, const T* y, T* z) const override {
    for (int idx = 0; idx < this->num_; ++idx) {
      const T sum = x[idx] + y[idx];
      z[idx] = sum > 0 ? sum : 0;
    }
  }
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx = _mm256_loadu_ps(x); \
__m256 tmpy = _mm256_loadu_ps(y); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \
_mm256_storeu_ps(z, tmpy); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ16>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(y); \
tmp0 = _mm256_add_ps(tmp0, tmp1); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp2 = _mm256_loadu_ps(y + 8); \
tmp1 = _mm256_add_ps(tmp1, tmp2); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(z, tmp0); \
_mm256_storeu_ps(z + 8, tmp1); \
}
#define INTRI_COMMON_FLOAT(isa, block) \
template <> \
VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
: VAddReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
} \
template <> \
void VAddReluKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmpx = _mm256_loadu_ps(x + i); \
__m256 tmpy = _mm256_loadu_ps(y + i); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, zeros); \
_mm256_storeu_ps(z + i, tmpy); \
} \
for (int i = this->end_; i < this->num_; ++i) { \
z[i] = x[i] + y[i]; \
z[i] = z[i] > 0 ? z[i] : 0; \
} \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_COMMON_FLOAT
(
jit
::
avx
,
kGT16
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
INTRI_COMMON_FLOAT
(
jit
::
avx2
,
kGT16
);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
INTRI_COMMON_FLOAT
(
jit
::
avx512f
,
kGT16
);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace jitkernel
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
664159ad
...
@@ -712,6 +712,63 @@ TEST(JitKernel, vadd) {
...
@@ -712,6 +712,63 @@ TEST(JitKernel, vadd) {
}
}
}
}
// Scalar reference for the fused add + ReLU: for every i in [0, n),
// z[i] = x[i] + y[i], clamped to 0 when the sum is not greater than 0.
// Nothing is written when n <= 0.
void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] + y[idx];
    // Anything that does not compare greater than zero becomes 0.
    if (!(z[idx] > 0)) {
      z[idx] = 0;
    }
    ++idx;
  }
}
// Unfused baseline for add + ReLU: runs the add kernel from x and y into z,
// then runs the ReLU kernel with z as both its input and its output.
void vaddrelu_better(
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
    const std::shared_ptr<
        const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
    const float* x, const float* y, float* z) {
  const auto& add_kernel = *vadd;
  const auto& relu_kernel = *vrelu;
  add_kernel.Compute(x, y, z);
  // In-place second pass over the sum.
  relu_kernel.Compute(z, z);
}
// Checks the fused add+ReLU JIT kernel for several vector sizes and logs the
// average time of three implementations:
//   * refer  - the scalar vaddrelu_ref loop,
//   * better - the separate vadd kernel followed by the vrelu kernel,
//   * tgt    - the fused VAddReluKernel under test.
// Each implementation writes to its own buffer, and both kernel-based results
// are compared element-wise against the scalar reference.
TEST(JitKernel, vaddrelu) {
  namespace jit = paddle::operators::math::jitkernel;
  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
    std::vector<float> x(d), y(d);
    // One output buffer per implementation so that no result can be
    // overwritten before it is checked.
    std::vector<float> zref(d), zbetter(d), ztgt(d);
    RandomVec<float>(d, x.data());
    RandomVec<float>(d, y.data());
    const auto& ker =
        jit::KernelPool::Instance().template Get<jit::VAddReluKernel<float>>(d);
    const auto& vadd =
        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
    const auto& vrelu =
        jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
    const float* x_data = x.data();
    const float* y_data = y.data();
    float* ztgt_data = ztgt.data();
    float* zref_data = zref.data();
    float* zbetter_data = zbetter.data();

    // The reference must apply the ReLU as well, hence vaddrelu_ref rather
    // than the plain vadd_ref.
    auto trefs = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
      vaddrelu_ref(d, x_data, y_data, zref_data);
    }
    auto trefe = GetCurrentUS();

    auto tmkls = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
      vaddrelu_better(vadd, vrelu, x_data, y_data, zbetter_data);
    }
    auto tmkle = GetCurrentUS();

    auto ttgts = GetCurrentUS();
    for (int i = 0; i < repeat; ++i) {
      ker->Compute(x_data, y_data, ztgt_data);
    }
    auto ttgte = GetCurrentUS();

    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
            << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
            << "tgt takes: " << (ttgte - ttgts) / repeat;
    for (int i = 0; i < d; ++i) {
      EXPECT_NEAR(zbetter_data[i], zref_data[i], 1e-3);
      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
    }
  }
}
TEST
(
JitKernel
,
pool
)
{
TEST
(
JitKernel
,
pool
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
const
int
frame_size
=
4
;
const
int
frame_size
=
4
;
...
...
python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
0 → 100644
浏览文件 @
664159ad
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
random
from
op_test
import
OpTest
from
test_seq_conv
import
seqconv
class TestSeqConvEltAddRelu(OpTest):
    """Output check for the ``fusion_seqconv_eltadd_relu`` operator.

    ``setUp`` draws random input, filter and bias tensors and builds the
    expected output as ``maximum(seqconv(x, w) + b, 0)``. Subclasses change
    the configuration by overriding ``set_conf``.
    """

    def set_conf(self):
        """Hook called by ``setUp`` after the defaults are assigned."""
        pass

    def setUp(self):
        self.op_type = 'fusion_seqconv_eltadd_relu'

        # Default configuration; set_conf() may override any of these.
        self.lod = [[6, 4]]
        self.in_fea_size = 16
        self.out_fea_size = 8
        self.context_length = 4
        self.context_stride = 1
        self.context_start = 0
        self.set_conf()

        assert self.context_stride == 1

        # Total number of rows across all sequences.
        total_steps = sum(self.lod[0])
        x = np.random.uniform(
            -1, 1, [total_steps, self.in_fea_size]).astype('float32')
        filter_shape = [
            self.in_fea_size * self.context_length, self.out_fea_size
        ]
        w = np.random.uniform(-1, 1, filter_shape).astype('float32')
        b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32')

        # Reference result: sequence convolution, bias add, then ReLU.
        conv = seqconv(x, self.lod, w, self.context_length,
                       self.context_start)
        expected = np.maximum(conv + b, 0)

        self.inputs = {'X': (x, self.lod), 'Filter': w, 'Bias': b}
        self.attrs = {
            'contextStart': self.context_start,
            'contextLength': self.context_length,
            'contextStride': self.context_stride
        }
        self.outputs = {'Out': expected}

    def test_check_output(self):
        self.check_output()
class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu):
    """Same check with a single sequence of length 10."""

    def set_conf(self):
        single_sequence = [10]
        self.lod = [single_sequence]
class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu):
    """Single sequence of length 2, shorter than the default context length."""

    def set_conf(self):
        single_sequence = [2]
        self.lod = [single_sequence]
class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu):
    """Four sequences with a context window of 3 rows starting 2 rows back."""

    def set_conf(self):
        self.context_start = -2
        self.context_length = 3
        self.lod = [[3, 5, 1, 6]]
class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu):
    """Seven sequences, input feature size 2, window of 4 starting 1 row back."""

    def set_conf(self):
        self.context_start = -1
        self.context_length = 4
        self.in_fea_size = 2
        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu):
    """Seven sequences with a window of 5 rows starting 4 rows back."""

    def set_conf(self):
        self.context_start = -4
        self.context_length = 5
        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
if __name__ == '__main__':
    # Run the test cases of this module when it is executed as a script.
    unittest.main()
python/paddle/fluid/tests/unittests/test_seq_conv.py
浏览文件 @
664159ad
...
@@ -20,6 +20,53 @@ import random
...
@@ -20,6 +20,53 @@ import random
from
op_test
import
OpTest
from
op_test
import
OpTest
def seqconv(x,
            lod,
            filter,
            context_length,
            context_start,
            padding_trainable=False,
            padding_data=None):
    """Reference sequence convolution used to build expected test outputs.

    Each sequence's rows are unfolded into a matrix with ``context_length``
    column blocks, which is then multiplied by ``filter``.

    Args:
        x: 2-D array; its rows are the concatenated sequences.
        lod: ``lod[0]`` lists the length of every sequence stored in ``x``.
        filter: matrix the unfolded context matrix is multiplied with.
        context_length: number of rows in the context window.
        context_start: offset of the first window row relative to the
            current row.
        padding_trainable: when true, window positions falling outside a
            sequence are filled from ``padding_data``; otherwise they stay
            zero.
        padding_data: 2-D array of padding rows; only read when
            ``padding_trainable`` is true.

    Returns:
        ``np.dot(col, filter)`` where ``col`` is the float32 unfolded matrix.
    """
    num_rows, feat = x.shape
    col = np.zeros((num_rows, context_length * feat)).astype('float32')

    # Cumulative sequence boundaries: bounds[k]..bounds[k + 1] is sequence k.
    bounds = [0]
    for length in lod[0]:
        bounds.append(bounds[-1] + length)

    begin_pad = np.max([0, -context_start])

    for seq_start, seq_end in zip(bounds[:-1], bounds[1:]):
        seq_len = seq_end - seq_start
        for j in range(context_length):
            shift = context_start + j
            in_begin = seq_start + shift
            in_end = seq_end + shift
            out_begin = seq_start
            out_end = seq_end
            # Columns of `col` that hold window position j.
            block = slice(j * feat, (j + 1) * feat)

            if in_begin < seq_start:
                # The window reaches before the sequence start.
                pad_size = np.min([seq_start - in_begin, seq_len])
                if padding_trainable:
                    col[seq_start:seq_start + pad_size, block] = \
                        padding_data[j:j + pad_size, :]
                out_begin = seq_start + pad_size
                in_begin = seq_start

            if in_end > seq_end:
                # The window reaches past the sequence end.
                pad_size = np.min([in_end - seq_end, seq_len])
                if padding_trainable:
                    pad_stop = begin_pad + shift
                    col[seq_end - pad_size:seq_end, block] = \
                        padding_data[pad_stop - pad_size:pad_stop, :]
                in_end = seq_end
                out_end = seq_end - pad_size

            if in_end <= in_begin:
                # No real input rows remain for this window position.
                continue

            col[out_begin:out_end, block] += x[in_begin:in_end, :]

    return np.dot(col, filter)
class
TestSeqProject
(
OpTest
):
class
TestSeqProject
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
init_test_case
()
self
.
init_test_case
()
...
@@ -66,57 +113,9 @@ class TestSeqProject(OpTest):
...
@@ -66,57 +113,9 @@ class TestSeqProject(OpTest):
'paddingTrainable'
:
self
.
padding_trainable
,
'paddingTrainable'
:
self
.
padding_trainable
,
'contextStride'
:
self
.
context_stride
'contextStride'
:
self
.
context_stride
}
}
out
=
np
.
zeros
(
out
=
seqconv
(
x
,
self
.
lod
,
w
,
self
.
context_length
,
self
.
context_start
,
(
self
.
input_size
[
0
],
self
.
output_represention
)).
astype
(
'float32'
)
self
.
padding_trainable
,
self
.
pad_data
)
self
.
outputs
=
{
'Out'
:
out
}
self
.
outputs
=
{
'Out'
:
out
}
self
.
compute
()
def
compute
(
self
):
x
,
lod
=
self
.
inputs
[
'X'
]
filter
=
self
.
inputs
[
'Filter'
]
pading_data
=
self
.
pad_data
out
=
np
.
zeros
((
self
.
input_size
[
0
],
self
.
context_length
*
self
.
input_size
[
1
])).
astype
(
'float32'
)
offset
=
[
0
]
for
seq_len
in
lod
[
0
]:
offset
.
append
(
offset
[
-
1
]
+
seq_len
)
begin_pad
=
np
.
max
([
0
,
-
self
.
context_start
])
for
i
in
range
(
len
(
offset
)
-
1
):
for
j
in
range
(
self
.
context_length
):
in_begin
=
offset
[
i
]
+
self
.
context_start
+
j
in_end
=
offset
[
i
+
1
]
+
self
.
context_start
+
j
out_begin
=
offset
[
i
]
out_end
=
offset
[
i
+
1
]
if
in_begin
<
offset
[
i
]:
pad_size
=
np
.
min
(
[
offset
[
i
]
-
in_begin
,
offset
[
i
+
1
]
-
offset
[
i
]])
if
self
.
padding_trainable
:
sub_w
=
pading_data
[
j
:
j
+
pad_size
,
:]
out
[
offset
[
i
]:
offset
[
i
]
+
pad_size
,
j
*
self
.
input_size
[
1
]:(
j
+
1
)
*
self
.
input_size
[
1
]]
=
sub_w
out_begin
=
offset
[
i
]
+
pad_size
in_begin
=
offset
[
i
]
if
in_end
>
offset
[
i
+
1
]:
pad_size
=
np
.
min
(
[
in_end
-
offset
[
i
+
1
],
offset
[
i
+
1
]
-
offset
[
i
]])
if
self
.
padding_trainable
:
sub_w
=
pading_data
[
begin_pad
+
self
.
context_start
+
j
-
pad_size
:
begin_pad
+
self
.
context_start
+
j
,
:]
out
[
offset
[
i
+
1
]
-
pad_size
:
offset
[
i
+
1
],
j
*
self
.
input_size
[
1
]:(
j
+
1
)
*
self
.
input_size
[
1
]]
=
sub_w
in_end
=
offset
[
i
+
1
]
out_end
=
offset
[
i
+
1
]
-
pad_size
if
in_end
<=
in_begin
:
continue
in_sub
=
x
[
in_begin
:
in_end
,
:]
out
[
out_begin
:
out_end
,
j
*
self
.
input_size
[
1
]:(
j
+
1
)
*
self
.
input_size
[
1
]]
+=
in_sub
np
.
dot
(
out
,
filter
,
out
=
self
.
outputs
[
'Out'
])
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录