PaddlePaddle / Paddle-Lite
Commit f6a7712f
Authored Dec 07, 2018 by hjchen2
Refactor dequant fusion kernels to support more fusion patterns
Parent: f163d146
Showing 20 changed files with 560 additions and 503 deletions (+560 −503)
src/common/types.cpp                                   +3    -1
src/common/types.h                                     +14   -4
src/framework/load_ops.h                               +4    -0
src/framework/program/program-optimize/node.cpp        +10   -1
src/operators/fusion_dequant_add_bn_op.h               +3    -2
src/operators/fusion_dequant_add_bn_relu_op.h          +5    -4
src/operators/fusion_dequant_add_bn_relu_quant_op.h    +5    -3
src/operators/fusion_dequant_bn_op.cpp                 +22   -7
src/operators/fusion_dequant_bn_op.h                   +101  -0
src/operators/fusion_dequant_bn_relu_op.h              +2    -1
src/operators/kernel/arm/dequant_add_bn_kernel.cpp     +0    -117
src/operators/kernel/arm/dequantize_bn_kernel.cpp      +80   -135
src/operators/kernel/arm/quantize_kernel.cpp           +4    -4
src/operators/kernel/arm/relu_kernel.cpp               +10   -40
src/operators/kernel/dequant_bn_kernel.h               +57   -0
src/operators/math/activation.h                        +65   -0
src/operators/math/quantize.h                          +8    -8
src/operators/op_param.h                               +13   -99
test/operators/test_conv_op.cpp                        +150  -77
tools/op.cmake                                         +4    -0
src/common/types.cpp
...
@@ -72,6 +72,7 @@ const char *G_OP_TYPE_SUM = "sum";
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
 const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
 const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
 const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
...
@@ -144,7 +145,8 @@ std::unordered_map<
     {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
     {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
     {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
-    {G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
+    {G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
+    {G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
     {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
     {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
     {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
...
src/common/types.h
...
@@ -87,10 +87,19 @@ enum PMStatus {
 };

 enum RoundType {
-  ROUND_UNK = 0,
-  ROUND_NEAREST_AWAY_ZERO = 1,
-  ROUND_NEAREST_TOWARDS_ZERO = 2,
-  ROUND_NEAREST_TO_EVEN = 3
+  ROUND_NEAREST_AWAY_ZERO = 0,
+  ROUND_NEAREST_TOWARDS_ZERO = 1,
+  ROUND_NEAREST_TO_EVEN = 2,
 };

+enum ActivationType {
+  Linear = 0,
+  Relu = 1,
+  Relu6 = 2,
+  PRelu = 3,
+  LeakyRelu = 4,
+  Tanh = 5,
+  Sigmoid = 6,
+};
+
 extern const char *G_OP_TYPE_CONV;
...
@@ -139,6 +148,7 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
+extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
 extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
 extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
...
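The new ActivationType enum is what lets a single templated kernel body serve several fused ops. A minimal standalone sketch of the dispatch idea (names mirror the diff; because the activation is a compile-time template argument, the selection costs nothing at run time):

#include <algorithm>

enum ActivationType { Linear = 0, Relu = 1, Relu6 = 2 };

// Identity for Linear; specializations override per activation.
template <ActivationType Act = Linear>
inline float Active(const float &x) { return x; }

template <>
inline float Active<Relu>(const float &x) { return std::max(x, 0.f); }

template <>
inline float Active<Relu6>(const float &x) {
  return std::min(std::max(x, 0.f), 6.f);
}

// One kernel body can then back several fused ops:
//   DequantBNCompute<Linear>(...)  -> fusion_dequant_bn
//   DequantBNCompute<Relu>(...)    -> fusion_dequant_bn_relu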
src/framework/load_ops.h
...
@@ -234,6 +234,10 @@ LOAD_OP1(quantize, CPU);
 #ifdef DEQUANT_OP
 LOAD_OP1(dequantize, CPU);
 #endif
+#ifdef FUSION_DEQUANT_BN_OP
+LOAD_OP1(fusion_dequant_bn, CPU);
+LOAD_FUSION_MATCHER(fusion_dequant_bn);
+#endif
 #ifdef FUSION_DEQUANT_ADD_BN_OP
 LOAD_OP1(fusion_dequant_add_bn, CPU);
 LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
...
src/framework/program/program-optimize/node.cpp
...
@@ -220,7 +220,16 @@ void Node::Folder(
     }
   } else {
     for (auto &op_output : this->op_desc_->outputs_) {
-      op_desc->outputs_.emplace(op_output.first, op_output.second);
+      auto output_key = op_output.first;
+      if (change->find(this->type_) != change->end()) {
+        const auto change_pairs = (*change)[this->type_];
+        for (const auto &target : change_pairs) {
+          if (target.first == output_key) {
+            output_key = target.second;
+          }
+        }
+      }
+      op_desc->outputs_.emplace(output_key, op_output.second);
     }
     for (auto &output : this->outputs_) {
...
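To illustrate the folding change above with a self-contained sketch (hypothetical names; the real code walks framework::Node objects): the change map lets the fused node rename an inner op's output key, e.g. batch_norm's "Y" to the fused op's "Out", while keeping the attached variable list.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // change map: per inner-op type, a list of (old key, new key) renames
  std::map<std::string, std::vector<std::pair<std::string, std::string>>>
      change = {{"batch_norm", {{"Y", "Out"}}}};
  std::string node_type = "batch_norm";
  std::string output_key = "Y";  // key coming from the folded node
  if (change.find(node_type) != change.end()) {
    for (const auto &target : change[node_type]) {
      if (target.first == output_key) output_key = target.second;
    }
  }
  std::cout << output_key << std::endl;  // prints "Out"
}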
src/operators/fusion_dequant_add_bn_op.h
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/dequant_add_bn_kernel.h"
+#include "operators/kernel/dequant_bn_kernel.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
...
@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
                 {{"Scale", "BNScale"},
                  {"Mean", "BNMean"},
                  {"Bias", "BNBias"},
-                 {"Variance", "BNVariance"}}}},
+                 {"Variance", "BNVariance"},
+                 {"Y", "Out"}}}},
                 removed_nodes);
  }
...
src/operators/fusion_dequant_add_bn_relu_op.h
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/dequant_bn_relu_kernel.h"
+#include "operators/kernel/dequant_bn_kernel.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
...
@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
                 {{"Scale", "BNScale"},
                  {"Mean", "BNMean"},
                  {"Bias", "BNBias"},
-                 {"Variance", "BNVariance"}}}},
+                 {"Variance", "BNVariance"},
+                 {"Y", "Out"}}}},
                 removed_nodes);
  }
...
@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
 template <typename DeviceType, typename T>
 class FusionDequantAddBNReluOp
     : public framework::OperatorWithKernel<
-          DeviceType, FusionDequantAddBNReluParam<DeviceType>,
+          DeviceType, FusionDequantAddBNParam<DeviceType>,
           operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
  public:
   FusionDequantAddBNReluOp(const std::string &type,
...
@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
                            const framework::AttributeMap &attrs,
                            std::shared_ptr<framework::Scope> scope)
       : framework::OperatorWithKernel<
-            DeviceType, FusionDequantAddBNReluParam<DeviceType>,
+            DeviceType, FusionDequantAddBNParam<DeviceType>,
             operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
             type, inputs, outputs, attrs, scope) {}
   // inference output shape
...
src/operators/fusion_dequant_add_bn_relu_quant_op.h
...
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/dequant_bn_relu_kernel.h"
+#include "operators/kernel/dequant_bn_kernel.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
...
@@ -44,7 +44,8 @@ class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
                 {{"Scale", "BNScale"},
                  {"Mean", "BNMean"},
                  {"Bias", "BNBias"},
-                 {"Variance", "BNVariance"}}}},
+                 {"Variance", "BNVariance"},
+                 {"Y", "Out"}}}},
                 removed_nodes);
  }
...
@@ -90,7 +91,8 @@ class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
                 {{"Scale", "BNScale"},
                  {"Mean", "BNMean"},
                  {"Bias", "BNBias"},
-                 {"Variance", "BNVariance"}}}},
+                 {"Variance", "BNVariance"},
+                 {"Y", "Out"}}}},
                 removed_nodes);
  }
...
src/operators/fusion_dequant_bn_relu_op.cpp → src/operators/fusion_dequant_bn_op.cpp
...
@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef FUSION_DEQUANT_BN_RELU_OP
-
-#include "operators/fusion_dequant_bn_relu_op.h"
+#include "operators/fusion_dequant_bn_op.h"

 namespace paddle_mobile {
 namespace operators {

+#ifdef FUSION_DEQUANT_BN_OP
+template <typename Dtype, typename T>
+void FusionDequantBNOp<Dtype, T>::InferShape() const {
+  const auto &input_dims = this->param_.input_->dims();
+  this->param_.output_->Resize(input_dims);
+}
+#endif  // FUSION_DEQUANT_BN_OP
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
 template <typename Dtype, typename T>
 void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
   const auto &input_dims = this->param_.input_->dims();
   this->param_.output_->Resize(input_dims);
 }
+#endif  // FUSION_DEQUANT_BN_RELU_OP

 }  // namespace operators
 }  // namespace paddle_mobile

 namespace ops = paddle_mobile::operators;
+#ifdef FUSION_DEQUANT_BN_OP
+REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
+#endif  // PADDLE_MOBILE_CPU
+#endif  // FUSION_DEQUANT_BN_OP
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
 REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu, ops::FusionDequantBNReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
-#endif
-#endif
+#endif  // PADDLE_MOBILE_CPU
+#endif  // FUSION_DEQUANT_BN_RELU_OP
src/operators/fusion_dequant_bn_op.h (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
 public:
  FusionDequantBNMatcher() {
    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
  }

  virtual void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_BATCHNORM,
                   {{"Scale", "BNScale"},
                    {"Mean", "BNMean"},
                    {"Bias", "BNBias"},
                    {"Variance", "BNVariance"},
                    {"Y", "Out"}}}},
                 removed_nodes);
  }

  std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif  // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP

#ifdef FUSION_DEQUANT_BN_OP
template <typename DeviceType, typename T>
class FusionDequantBNOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionDequantBNParam<DeviceType>,
          operators::FusionDequantBNKernel<DeviceType, T>> {
 public:
  FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
                    const VariableNameMap &outputs,
                    const framework::AttributeMap &attrs,
                    std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // inference output shape
  void InferShape() const override;
};
#endif  // FUSION_DEQUANT_BN_OP

#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
 public:
  FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }

  virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};

template <typename DeviceType, typename T>
class FusionDequantBNReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionDequantBNParam<DeviceType>,
          operators::FusionDequantBNReluKernel<DeviceType, T>> {
 public:
  FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  void InferShape() const override;
};
#endif  // FUSION_DEQUANT_BN_RELU_OP

}  // namespace operators
}  // namespace paddle_mobile
src/operators/fusion_dequant_bn_relu_op.h
...
@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
                 {{"Scale", "BNScale"},
                  {"Mean", "BNMean"},
                  {"Bias", "BNBias"},
-                 {"Variance", "BNVariance"}}}},
+                 {"Variance", "BNVariance"},
+                 {"Y", "Out"}}}},
                 removed_nodes);
  }
...
src/operators/kernel/arm/dequant_add_bn_kernel.cpp (deleted; 100644 → 0)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_DEQUANT_ADD_BN_OP

#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

namespace paddle_mobile {
namespace operators {

template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
    FusionDequantAddBNParam<CPU> *param) {
  // elementwise add params
  const Tensor *bias = param->bias_;
  // batch norm params
  const Tensor *bn_mean = param->bn_mean_;
  const Tensor *bn_variance = param->bn_variance_;
  Tensor *bn_scale = param->bn_scale_;
  Tensor *bn_bias = param->bn_bias_;
  const float epsilon = param->epsilon_;

  const float *bias_ptr = bias->data<float>();
  const float *mean_ptr = bn_mean->data<float>();
  const float *var_ptr = bn_variance->data<float>();
  float *bn_scale_ptr = bn_scale->mutable_data<float>();
  float *bn_bias_ptr = bn_bias->mutable_data<float>();
  for (int c = 0; c < bn_scale->numel(); ++c) {
    float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
    bn_scale_ptr[c] = inv_scale;
    bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
  }
  return true;
}

template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
    const FusionDequantAddBNParam<CPU> &param) {
  const int32_t *input = param.input_->data<int32_t>();
  const float *bn_scale = param.bn_scale_->data<float>();
  const float *bn_bias = param.bn_bias_->data<float>();
  // dequantize params
  const float activation_scale = param.activation_scale_->data<float>()[0];
  const float weight_scale = param.weight_scale_;
  const float dequant_scale = activation_scale / weight_scale;

  float *output = param.output_->mutable_data<float>();
  int batch_size = param.input_->dims()[0];
  int channels = param.input_->dims()[1];
  size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];

  #pragma omp parallel for collapse(2)
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int c = 0; c < channels; ++c) {
      // not fuse bn and dequant scale to minimize precision difference
      // float scale = bn_scale[c] * dequant_scale;
      float scale = bn_scale[c];
      float bias = bn_bias[c];
      size_t offset = (batch * channels + c) * spatial_size;
      const int32_t *x = input + offset;
      float *y = output + offset;
      size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      int loop = spatial_size >> 4;
      remain = spatial_size & 0xF;
      float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
      float32x4_t __scale = vdupq_n_f32(scale);
      float32x4_t __bias = vdupq_n_f32(bias);
      for (int k = 0; k < loop; ++k, x += 16, y += 16) {
        int32x4_t r0 = vld1q_s32(x);
        int32x4_t r1 = vld1q_s32(x + 4);
        int32x4_t r2 = vld1q_s32(x + 8);
        int32x4_t r3 = vld1q_s32(x + 12);
        float32x4_t f0 = vcvtq_f32_s32(r0);
        float32x4_t f1 = vcvtq_f32_s32(r1);
        float32x4_t f2 = vcvtq_f32_s32(r2);
        float32x4_t f3 = vcvtq_f32_s32(r3);
        f0 = vmulq_f32(__dequant_scale, f0);
        f1 = vmulq_f32(__dequant_scale, f1);
        f2 = vmulq_f32(__dequant_scale, f2);
        f3 = vmulq_f32(__dequant_scale, f3);
        f0 = vmlaq_f32(__bias, __scale, f0);
        f1 = vmlaq_f32(__bias, __scale, f1);
        f2 = vmlaq_f32(__bias, __scale, f2);
        f3 = vmlaq_f32(__bias, __scale, f3);
        vst1q_f32(y, f0);
        vst1q_f32(y + 4, f1);
        vst1q_f32(y + 8, f2);
        vst1q_f32(y + 12, f3);
      }
#endif  // __ARM_NEON__
      for (int k = 0; k < remain; ++k) {
        y[k] = scale * (dequant_scale * x[k]) + bias;
      }
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // FUSION_DEQUANT_ADD_BN_OP
src/operators/kernel/arm/dequant_bn_relu_kernel.cpp → src/operators/kernel/arm/dequantize_bn_kernel.cpp

 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
...
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "operators/kernel/dequant_bn_relu_kernel.h"
 #include <cmath>
+#include "operators/kernel/dequant_bn_kernel.h"
+#include "operators/math/activation.h"
 #include "operators/math/quantize.h"
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
...
@@ -22,8 +23,10 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

-#if defined(FUSION_DEQUANT_BN_RELU_OP) ||        \
-    defined(FUSION_DEQUANT_ADD_BN_RELU_OP) ||    \
+#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
+    defined(FUSION_DEQUANT_BN_RELU_OP) ||                                 \
+    defined(FUSION_DEQUANT_ADD_BN_RELU_OP) ||                             \
     defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) ||   \
     defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
 void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
                                     const framework::Tensor *bias) {
...
@@ -39,16 +42,20 @@ void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
   float *bn_scale_ptr = bn_scale->mutable_data<float>();
   float *bn_bias_ptr = bn_bias->mutable_data<float>();
   for (int c = 0; c < bn_scale->numel(); ++c) {
-    float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
-    bn_scale_ptr[c] = inv_scale;
+    float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
     float val = bias ? bias->data<float>()[c] : 0;
-    bn_bias_ptr[c] = inv_scale * (val - mean_ptr[c]) + bn_bias_ptr[c];
+    bn_bias_ptr[c] =
+        inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
+    bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
   }
 }
 #endif

-#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
-void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
+#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
+    defined(FUSION_DEQUANT_BN_RELU_OP) ||                                 \
+    defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
+template <ActivationType Act>
+void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
   const int32_t *input = param->input_->data<int32_t>();
   const float *bn_scale = param->bn_scale_->data<float>();
   const float *bn_bias = param->bn_bias_->data<float>();
...
@@ -79,7 +86,6 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
       float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
       float32x4_t __scale = vdupq_n_f32(scale);
       float32x4_t __bias = vdupq_n_f32(bias);
-      float32x4_t __zero = vdupq_n_f32(0.f);
       for (int k = 0; k < loop; ++k, x += 16, y += 16) {
         int32x4_t r0 = vld1q_s32(x);
         int32x4_t r1 = vld1q_s32(x + 4);
...
@@ -97,10 +103,10 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
         f1 = vmlaq_f32(__bias, __scale, f1);
         f2 = vmlaq_f32(__bias, __scale, f2);
         f3 = vmlaq_f32(__bias, __scale, f3);
-        f0 = vmaxq_f32(__zero, f0);
-        f1 = vmaxq_f32(__zero, f1);
-        f2 = vmaxq_f32(__zero, f2);
-        f3 = vmaxq_f32(__zero, f3);
+        f0 = math::vActiveq_f32<Act>(f0);
+        f1 = math::vActiveq_f32<Act>(f1);
+        f2 = math::vActiveq_f32<Act>(f2);
+        f3 = math::vActiveq_f32<Act>(f3);
         vst1q_f32(y, f0);
         vst1q_f32(y + 4, f1);
         vst1q_f32(y + 8, f2);
...
@@ -108,32 +114,62 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
       }
 #endif  // __ARM_NEON__
       for (int k = 0; k < remain; ++k) {
-        y[k] = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
+        y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
       }
     }
   }
 }
 #endif

+#ifdef FUSION_DEQUANT_BN_OP
+template <>
+bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
+  PublicFusionDequantBNInitParam(param, nullptr);
+  return true;
+}
+
+template <>
+void FusionDequantBNKernel<CPU, float>::Compute(
+    const FusionDequantBNParam<CPU> &param) {
+  DequantBNCompute<Linear>(&param);
+}
+#endif  // FUSION_DEQUANT_BN_OP
+
 #ifdef FUSION_DEQUANT_BN_RELU_OP
 template <>
 bool FusionDequantBNReluKernel<CPU, float>::Init(
-    FusionDequantBNReluParam<CPU> *param) {
+    FusionDequantBNParam<CPU> *param) {
   PublicFusionDequantBNInitParam(param, nullptr);
   return true;
 }

 template <>
 void FusionDequantBNReluKernel<CPU, float>::Compute(
-    const FusionDequantBNReluParam<CPU> &param) {
-  DequantBNReluCompute(&param);
+    const FusionDequantBNParam<CPU> &param) {
+  DequantBNCompute<Relu>(&param);
 }
 #endif  // FUSION_DEQUANT_BN_RELU_OP

+#ifdef FUSION_DEQUANT_ADD_BN_OP
+template <>
+bool FusionDequantAddBNKernel<CPU, float>::Init(
+    FusionDequantAddBNParam<CPU> *param) {
+  const framework::Tensor *bias = param->bias_;
+  PublicFusionDequantBNInitParam(param, bias);
+  return true;
+}
+
+template <>
+void FusionDequantAddBNKernel<CPU, float>::Compute(
+    const FusionDequantAddBNParam<CPU> &param) {
+  DequantBNCompute<Linear>(&param);
+}
+#endif  // FUSION_DEQUANT_ADD_BN_OP
+
 #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
 template <>
 bool FusionDequantAddBNReluKernel<CPU, float>::Init(
-    FusionDequantAddBNReluParam<CPU> *param) {
+    FusionDequantAddBNParam<CPU> *param) {
   const framework::Tensor *bias = param->bias_;
   PublicFusionDequantBNInitParam(param, bias);
   return true;
...
@@ -141,13 +177,14 @@ bool FusionDequantAddBNReluKernel<CPU, float>::Init(
 template <>
 void FusionDequantAddBNReluKernel<CPU, float>::Compute(
-    const FusionDequantAddBNReluParam<CPU> &param) {
-  DequantBNReluCompute(&param);
+    const FusionDequantAddBNParam<CPU> &param) {
+  DequantBNCompute<Relu>(&param);
 }
 #endif  // FUSION_DEQUANT_ADD_BN_RELU_OP

-#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
-template <RoundType R>
+#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
+    defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
+template <ActivationType Act, RoundType R>
 void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
   const int32_t *input = param->input_->data<int32_t>();
   const float *bn_scale = param->bn_scale_->data<float>();
...
@@ -167,9 +204,8 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
   // if (param->is_static_) {
   if (true) {
-    max_abs = param->offline_scale_->data<float>()[0];
+    max_abs = param->static_scale_;
     float quant_scale = 127.f / max_abs;
 #pragma omp parallel for collapse(2)
     for (int batch = 0; batch < batch_size; ++batch) {
       for (int c = 0; c < channels; ++c) {
...
@@ -205,14 +241,18 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
         f1 = vmlaq_f32(__bias, __scale, f1);
         f2 = vmlaq_f32(__bias, __scale, f2);
         f3 = vmlaq_f32(__bias, __scale, f3);
+        f0 = math::vActiveq_f32<Act>(f0);
+        f1 = math::vActiveq_f32<Act>(f1);
+        f2 = math::vActiveq_f32<Act>(f2);
+        f3 = math::vActiveq_f32<Act>(f3);
         f0 = vmulq_f32(__quant_scale, f0);
         f1 = vmulq_f32(__quant_scale, f1);
         f2 = vmulq_f32(__quant_scale, f2);
         f3 = vmulq_f32(__quant_scale, f3);
-        int32x4_t q0 = math::vround_f32<R>(f0);
-        int32x4_t q1 = math::vround_f32<R>(f1);
-        int32x4_t q2 = math::vround_f32<R>(f2);
-        int32x4_t q3 = math::vround_f32<R>(f3);
+        int32x4_t q0 = math::vRoundq_f32<R>(f0);
+        int32x4_t q1 = math::vRoundq_f32<R>(f1);
+        int32x4_t q2 = math::vRoundq_f32<R>(f2);
+        int32x4_t q3 = math::vRoundq_f32<R>(f3);
         int16x4_t d0 = vmovn_s32(q0);
         int16x4_t d1 = vmovn_s32(q1);
         int16x4_t d2 = vmovn_s32(q2);
...
@@ -226,13 +266,14 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
       }
 #endif  // __ARM_NEON__
       for (int k = 0; k < remain; ++k) {
-        float x_temp = scale * (dequant_scale * x[k]) + bias;
+        float x_temp =
+            math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
         y[k] = math::Round<R>(x_temp * quant_scale);
       }
     }
   }
   } else {
-    // TODO(hjchen2)
+    // TODO
     max_abs = std::max(max_abs, 1e-6f);
   }
   param->online_scale_->mutable_data<float>()[0] = max_abs;
...
@@ -251,13 +292,13 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
     const FusionDequantAddBNQuantParam<CPU> &param) {
   switch (param.round_type_) {
     case ROUND_NEAREST_TO_EVEN:
-      DequantBNQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
+      DequantBNQuantCompute<Linear, ROUND_NEAREST_TO_EVEN>(&param);
       break;
     case ROUND_NEAREST_TOWARDS_ZERO:
-      DequantBNQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
+      DequantBNQuantCompute<Linear, ROUND_NEAREST_TOWARDS_ZERO>(&param);
       break;
     case ROUND_NEAREST_AWAY_ZERO:
-      DequantBNQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
+      DequantBNQuantCompute<Linear, ROUND_NEAREST_AWAY_ZERO>(&param);
       break;
     default:
       LOG(kLOG_ERROR) << "round type is not supported.";
...
@@ -267,105 +308,9 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
 #endif  // FUSION_DEQUANT_ADD_BN_QUANT_OP

 #ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
-template <RoundType R>
-void DequantBNReluQuantCompute(
-    const FusionDequantAddBNReluQuantParam<CPU> *param) {
-  const int32_t *input = param->input_->data<int32_t>();
-  const float *bn_scale = param->bn_scale_->data<float>();
-  const float *bn_bias = param->bn_bias_->data<float>();
-  // dequantize params
-  const float activation_scale = param->activation_scale_->data<float>()[0];
-  const float weight_scale = param->weight_scale_;
-  const float dequant_scale = activation_scale / weight_scale;
-  // quantize params
-  Tensor *output_scale = param->online_scale_;
-  float max_abs = 0.f;
-
-  int8_t *output = param->output_->mutable_data<int8_t>();
-  int batch_size = param->input_->dims()[0];
-  int channels = param->input_->dims()[1];
-  size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
-
-  // if (param->is_static_) {
-  if (true) {
-    max_abs = param->offline_scale_->data<float>()[0];
-    float quant_scale = 127.f / max_abs;
-#pragma omp parallel for collapse(2)
-    for (int batch = 0; batch < batch_size; ++batch) {
-      for (int c = 0; c < channels; ++c) {
-        // float scale = bn_scale[c] * dequant_scale;
-        float scale = bn_scale[c];
-        float bias = bn_bias[c];
-        size_t offset = (batch * channels + c) * spatial_size;
-        const int32_t *x = input + offset;
-        int8_t *y = output + offset;
-        size_t remain = spatial_size;
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-        int loop = spatial_size >> 4;
-        remain = spatial_size & 0xF;
-        float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
-        float32x4_t __scale = vdupq_n_f32(scale);
-        float32x4_t __bias = vdupq_n_f32(bias);
-        float32x4_t __zero = vdupq_n_f32(0.f);
-        float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
-        for (int k = 0; k < loop; ++k, x += 16, y += 16) {
-          int32x4_t r0 = vld1q_s32(x);
-          int32x4_t r1 = vld1q_s32(x + 4);
-          int32x4_t r2 = vld1q_s32(x + 8);
-          int32x4_t r3 = vld1q_s32(x + 12);
-          float32x4_t f0 = vcvtq_f32_s32(r0);
-          float32x4_t f1 = vcvtq_f32_s32(r1);
-          float32x4_t f2 = vcvtq_f32_s32(r2);
-          float32x4_t f3 = vcvtq_f32_s32(r3);
-          f0 = vmulq_f32(__dequant_scale, f0);
-          f1 = vmulq_f32(__dequant_scale, f1);
-          f2 = vmulq_f32(__dequant_scale, f2);
-          f3 = vmulq_f32(__dequant_scale, f3);
-          f0 = vmlaq_f32(__bias, __scale, f0);
-          f1 = vmlaq_f32(__bias, __scale, f1);
-          f2 = vmlaq_f32(__bias, __scale, f2);
-          f3 = vmlaq_f32(__bias, __scale, f3);
-          f0 = vmaxq_f32(__zero, f0);
-          f1 = vmaxq_f32(__zero, f1);
-          f2 = vmaxq_f32(__zero, f2);
-          f3 = vmaxq_f32(__zero, f3);
-          f0 = vmulq_f32(__quant_scale, f0);
-          f1 = vmulq_f32(__quant_scale, f1);
-          f2 = vmulq_f32(__quant_scale, f2);
-          f3 = vmulq_f32(__quant_scale, f3);
-          int32x4_t q0 = math::vround_f32<R>(f0);
-          int32x4_t q1 = math::vround_f32<R>(f1);
-          int32x4_t q2 = math::vround_f32<R>(f2);
-          int32x4_t q3 = math::vround_f32<R>(f3);
-          int16x4_t d0 = vmovn_s32(q0);
-          int16x4_t d1 = vmovn_s32(q1);
-          int16x4_t d2 = vmovn_s32(q2);
-          int16x4_t d3 = vmovn_s32(q3);
-          int16x8_t q5 = vcombine_s16(d0, d1);
-          int16x8_t q6 = vcombine_s16(d2, d3);
-          int8x8_t d5 = vmovn_s16(q5);
-          int8x8_t d6 = vmovn_s16(q6);
-          vst1_s8(y, d5);
-          vst1_s8(y + 8, d6);
-        }
-#endif  // __ARM_NEON__
-        for (int k = 0; k < remain; ++k) {
-          float x_temp = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
-          y[k] = math::Round<R>(x_temp * quant_scale);
-        }
-      }
-    }
-  } else {
-    // TODO(hjchen2)
-    max_abs = std::max(max_abs, 1e-6f);
-  }
-  param->online_scale_->mutable_data<float>()[0] = max_abs;
-}
-
 template <>
 bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
-    FusionDequantAddBNReluQuantParam<CPU> *param) {
+    FusionDequantAddBNQuantParam<CPU> *param) {
   const framework::Tensor *bias = param->bias_;
   PublicFusionDequantBNInitParam(param, bias);
   return true;
...
@@ -373,16 +318,16 @@ bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
 template <>
 void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
-    const FusionDequantAddBNReluQuantParam<CPU> &param) {
+    const FusionDequantAddBNQuantParam<CPU> &param) {
   switch (param.round_type_) {
     case ROUND_NEAREST_TO_EVEN:
-      DequantBNReluQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
+      DequantBNQuantCompute<Relu, ROUND_NEAREST_TO_EVEN>(&param);
       break;
     case ROUND_NEAREST_TOWARDS_ZERO:
-      DequantBNReluQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
+      DequantBNQuantCompute<Relu, ROUND_NEAREST_TOWARDS_ZERO>(&param);
       break;
     case ROUND_NEAREST_AWAY_ZERO:
-      DequantBNReluQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
+      DequantBNQuantCompute<Relu, ROUND_NEAREST_AWAY_ZERO>(&param);
       break;
     default:
       LOG(kLOG_ERROR) << "round type is not supported.";
...
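As a numeric sanity check of PublicFusionDequantBNInitParam's folding (made-up values, not from the diff): batch norm y = gamma * (x - mean) / sqrt(var + eps) + beta collapses into a per-channel y = scale * x + bias that is computed once at Init time:

#include <cmath>
#include <cstdio>

int main() {
  float gamma = 0.5f, beta = 0.1f, mean = 2.f, var = 4.f, eps = 1e-5f;
  float elt_bias = 0.f;  // elementwise-add bias; 0 for the no-add variants
  float inv_scale = 1.f / std::sqrt(var + eps);
  float scale = inv_scale * gamma;                             // bn_scale_ptr[c]
  float bias = inv_scale * gamma * (elt_bias - mean) + beta;   // bn_bias_ptr[c]
  float x = 3.f;
  // Both expressions agree: the fold is exact up to float rounding.
  std::printf("%f %f\n", scale * x + bias,
              gamma * (x - mean) / std::sqrt(var + eps) + beta);
}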
src/operators/kernel/arm/quantize_kernel.cpp
...
@@ -53,10 +53,10 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
     r1 = vmulq_f32(r1, __scale);
     r2 = vmulq_f32(r2, __scale);
     r3 = vmulq_f32(r3, __scale);
-    int32x4_t q0 = math::vround_f32<R>(r0);
-    int32x4_t q1 = math::vround_f32<R>(r1);
-    int32x4_t q2 = math::vround_f32<R>(r2);
-    int32x4_t q3 = math::vround_f32<R>(r3);
+    int32x4_t q0 = math::vRoundq_f32<R>(r0);
+    int32x4_t q1 = math::vRoundq_f32<R>(r1);
+    int32x4_t q2 = math::vRoundq_f32<R>(r2);
+    int32x4_t q3 = math::vRoundq_f32<R>(r3);
     int16x4_t d0 = vmovn_s32(q0);
     int16x4_t d1 = vmovn_s32(q1);
     int16x4_t d2 = vmovn_s32(q2);
...
src/operators/kernel/arm/relu_kernel.cpp
...
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef RELU_OP

 #include "operators/kernel/relu_kernel.h"
+#include "common/types.h"
+#include "operators/math/activation.h"
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
...
@@ -22,45 +24,13 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

-enum ReluMode {
-  Relu = 0,
-  Relu6 = 1,
-  PRelu = 2,
-  LeakyRelu = 3,
-};
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <ReluMode R = Relu>
-inline float32x4_t vRelu_f32(const float32x4_t &x) {
-  float32x4_t __zero = vdupq_n_f32(0.f);
-  return vmaxq_f32(__zero, x);
-}
-
-template <>
-inline float32x4_t vRelu_f32<Relu6>(const float32x4_t &x) {
-  float32x4_t __zero = vdupq_n_f32(0.f);
-  float32x4_t __six = vdupq_n_f32(6.f);
-  return vminq_f32(__six, vmaxq_f32(__zero, x));
-}
-#endif
-
-template <ReluMode R = Relu>
-inline float ReluFunc(const float &x) {
-  return std::max(x, 0.f);
-}
-
-template <>
-inline float ReluFunc<Relu6>(const float &x) {
-  return std::min(std::max(x, 0.f), 6.f);
-}
-
-template <typename Dtype, ReluMode R>
+template <typename Dtype, ActivationType Act>
 struct ReluCompute {
   void operator()(const Tensor *input, Tensor *output) {}
 };

-template <ReluMode R>
-struct ReluCompute<float, R> {
+template <ActivationType Act>
+struct ReluCompute<float, Act> {
   void operator()(const Tensor *input, Tensor *output) {
     const float *x = input->data<float>();
     float *y = output->mutable_data<float>();
...
@@ -77,10 +47,10 @@ struct ReluCompute<float, R> {
       float32x4_t r1 = vld1q_f32(local_x + 4);
       float32x4_t r2 = vld1q_f32(local_x + 8);
       float32x4_t r3 = vld1q_f32(local_x + 12);
-      r0 = vRelu_f32<R>(r0);
-      r1 = vRelu_f32<R>(r1);
-      r2 = vRelu_f32<R>(r2);
-      r3 = vRelu_f32<R>(r3);
+      r0 = math::vActiveq_f32<Act>(r0);
+      r1 = math::vActiveq_f32<Act>(r1);
+      r2 = math::vActiveq_f32<Act>(r2);
+      r3 = math::vActiveq_f32<Act>(r3);
       vst1q_f32(local_y, r0);
       vst1q_f32(local_y + 4, r1);
       vst1q_f32(local_y + 8, r2);
...
@@ -90,7 +60,7 @@ struct ReluCompute<float, R> {
     y += (loop << 4);
 #endif
     for (size_t i = 0; i < remain; ++i) {
-      y[i] = ReluFunc<R>(x[i]);
+      y[i] = math::Active<Act>(x[i]);
     }
   }
 };
...
src/operators/kernel/dequant_bn_relu_kernel.h → src/operators/kernel/dequant_bn_kernel.h
...
@@ -20,48 +20,37 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

+#define DECLARE_KERNEL(KernelClass, KernelParam)                              \
+  template <typename DeviceType, typename T>                                  \
+  class KernelClass                                                           \
+      : public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
+   public:                                                                    \
+    bool Init(KernelParam<DeviceType> *param);                                \
+    void Compute(const KernelParam<DeviceType> &param);                       \
+  };
+
+#ifdef FUSION_DEQUANT_BN_OP
+DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
+#endif
+
 #ifdef FUSION_DEQUANT_BN_RELU_OP
-template <typename DeviceType, typename T>
-class FusionDequantBNReluKernel
-    : public framework::OpKernelBase<DeviceType,
-                                     FusionDequantBNReluParam<DeviceType>> {
- public:
-  void Compute(const FusionDequantBNReluParam<DeviceType> &param);
-  bool Init(FusionDequantBNReluParam<DeviceType> *param);
-};
+DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
 #endif

+#ifdef FUSION_DEQUANT_ADD_BN_OP
+DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
+#endif
+
 #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
-template <typename DeviceType, typename T>
-class FusionDequantAddBNReluKernel
-    : public framework::OpKernelBase<DeviceType,
-                                     FusionDequantAddBNReluParam<DeviceType>> {
- public:
-  void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
-  bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
-};
+DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
 #endif

-#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
-template <typename DeviceType, typename T>
-class FusionDequantAddBNReluQuantKernel
-    : public framework::OpKernelBase<
-          DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>> {
- public:
-  void Compute(const FusionDequantAddBNReluQuantParam<DeviceType> &param);
-  bool Init(FusionDequantAddBNReluQuantParam<DeviceType> *param);
-};
-#endif
-
 #ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
-template <typename DeviceType, typename T>
-class FusionDequantAddBNQuantKernel
-    : public framework::OpKernelBase<
-          DeviceType, FusionDequantAddBNQuantParam<DeviceType>> {
- public:
-  void Compute(const FusionDequantAddBNQuantParam<DeviceType> &param);
-  bool Init(FusionDequantAddBNQuantParam<DeviceType> *param);
-};
+DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
+#endif
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
+DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
 #endif

 }  // namespace operators
...
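For reference, DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam) should expand to roughly the following class, derived mechanically from the macro above (shown here as an illustration, not an additional declaration in the tree):

// Inside namespace paddle_mobile::operators:
template <typename DeviceType, typename T>
class FusionDequantBNKernel
    : public framework::OpKernelBase<DeviceType,
                                     FusionDequantBNParam<DeviceType>> {
 public:
  bool Init(FusionDequantBNParam<DeviceType> *param);
  void Compute(const FusionDequantBNParam<DeviceType> &param);
};

Notice that several kernels now share one param type (FusionDequantBNParam or FusionDequantAddBNParam), which is what lets the templated compute functions in dequantize_bn_kernel.cpp back all of them.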
src/operators/kernel/dequant_add_bn_kernel.h → src/operators/math/activation.h
...
@@ -14,24 +14,52 @@ limitations under the License. */

 #pragma once

-#ifdef FUSION_DEQUANT_ADD_BN_OP
-
-#include "framework/operator.h"
-#include "operators/op_param.h"
+#include <algorithm>
+#include <cmath>
+#include "common/types.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif

 namespace paddle_mobile {
 namespace operators {
+namespace math {

-template <typename DeviceType, typename T>
-class FusionDequantAddBNKernel
-    : public framework::OpKernelBase<DeviceType,
-                                     FusionDequantAddBNParam<DeviceType>> {
- public:
-  void Compute(const FusionDequantAddBNParam<DeviceType> &param);
-  bool Init(FusionDequantAddBNParam<DeviceType> *param);
-};
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <ActivationType Act = Linear>
+inline float32x4_t vActiveq_f32(const float32x4_t &x) {
+  return x;
+}

-}  // namespace operators
-}  // namespace paddle_mobile
+template <>
+inline float32x4_t vActiveq_f32<Relu>(const float32x4_t &x) {
+  float32x4_t __zero = vdupq_n_f32(0.f);
+  return vmaxq_f32(x, __zero);
+}

-#endif
+template <>
+inline float32x4_t vActiveq_f32<Relu6>(const float32x4_t &x) {
+  float32x4_t __zero = vdupq_n_f32(0.f);
+  float32x4_t __six = vdupq_n_f32(6.f);
+  return vminq_f32(vmaxq_f32(x, __zero), __six);
+}
+#endif
+
+template <ActivationType Act = Linear>
+inline float Active(const float &x) {
+  return x;
+}
+
+template <>
+inline float Active<Relu>(const float &x) {
+  return std::max(x, 0.f);
+}
+
+template <>
+inline float Active<Relu6>(const float &x) {
+  return std::min(std::max(x, 0.f), 6.f);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
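A hedged usage sketch of the new helpers (assuming an ARM build and the include paths of this tree; ReluInplace is a hypothetical helper, not part of the diff): vectorize the body four floats at a time and fall back to the scalar Active<> for the tail, which is exactly the split the kernels above use.

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/activation.h"  // the header introduced above

namespace paddle_mobile {
namespace operators {

// Apply ReLU in place: 4 lanes per iteration, scalar tail for the remainder.
void ReluInplace(float *data, int n) {
  int i = 0;
  for (; i + 4 <= n; i += 4) {
    float32x4_t v = vld1q_f32(data + i);
    vst1q_f32(data + i, math::vActiveq_f32<Relu>(v));
  }
  for (; i < n; ++i) {
    data[i] = math::Active<Relu>(data[i]);
  }
}

}  // namespace operators
}  // namespace paddle_mobile
#endif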
src/operators/math/quantize.h
...
@@ -50,31 +50,31 @@ inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
-inline int32x4_t vround_f32(float32x4_t r) {
-  return vcvtq_s32_f32(r);
+inline int32x4_t vRoundq_f32(const float32x4_t &x) {
+  return vcvtq_s32_f32(x);
 }

 template <>
-inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) {
+inline int32x4_t vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(const float32x4_t &x) {
   float32x4_t plus = vdupq_n_f32(0.5);
   float32x4_t minus = vdupq_n_f32(-0.5);
   float32x4_t zero = vdupq_n_f32(0);
-  uint32x4_t more_than_zero = vcgtq_f32(r, zero);
+  uint32x4_t more_than_zero = vcgtq_f32(x, zero);
   float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
-  temp = vaddq_f32(r, temp);
+  temp = vaddq_f32(x, temp);
   int32x4_t ret = vcvtq_s32_f32(temp);
   return ret;
 }

 template <>
-inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) {
+inline int32x4_t vRoundq_f32<ROUND_NEAREST_TO_EVEN>(const float32x4_t &x) {
   float32x4_t point5 = vdupq_n_f32(0.5);
   int32x4_t one = vdupq_n_s32(1);
   int32x4_t zero = vdupq_n_s32(0);
-  int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r);
+  int32x4_t rnd = math::vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(x);
   float32x4_t frnd = vcvtq_f32_s32(rnd);
-  frnd = vsubq_f32(frnd, r);
+  frnd = vsubq_f32(frnd, x);
   frnd = vabsq_f32(frnd);
   uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
   int32x4_t abs_rnd = vabsq_s32(rnd);
...
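The three rounding modes only disagree on ties; a scalar illustration with a made-up input (std::nearbyint rounds half to even under the C default FE_TONEAREST mode):

#include <cmath>
#include <cstdio>

int main() {
  float x = 2.5f;
  int towards_zero = static_cast<int>(x);  // 2: plain truncation (vcvtq_s32_f32)
  int away_zero = static_cast<int>(x + (x > 0 ? 0.5f : -0.5f));  // 3
  int to_even = static_cast<int>(std::nearbyint(x));             // 2
  std::printf("%d %d %d\n", towards_zero, away_zero, to_even);
}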
src/operators/op_param.h
...
@@ -2530,7 +2530,7 @@ class QuantizeParam : public OpParam {
     // scale = max(abs(x))
     online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
     // offline
-    if (OpParam::HasAttr("InScale", attrs)) {
+    if (inputs.count("InScale")) {
       offline_ = true;
       offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
     }
...
@@ -2566,9 +2566,7 @@ class DequantizeParam : public OpParam {
   DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                   const AttributeMap &attrs, const Scope &scope) {
     input_ = InputXFrom<GType>(inputs, scope);
-    if (outputs.count("Out")) {
-      output_ = OutFrom<GType>(outputs, scope);
-    }
+    output_ = OutFrom<GType>(outputs, scope);
     activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
     // dequantization is performed as x = x / static_scale / online_scale
     if (OpParam::HasAttr("weight_scale", attrs)) {
...
@@ -2588,10 +2586,10 @@ class DequantizeParam : public OpParam {
 };
 #endif

-#if defined(FUSION_DEQUANT_ADD_BN_OP) ||                                   \
+#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
     defined(FUSION_DEQUANT_ADD_BN_RELU_OP) ||                             \
     defined(FUSION_DEQUANT_BN_RELU_OP) ||                                 \
     defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) ||                            \
     defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
 template <typename Dtype>
 class FusionDequantBNParam : public DequantizeParam<Dtype> {
...
@@ -2609,10 +2607,6 @@ class FusionDequantBNParam : public DequantizeParam<Dtype> {
     bn_scale_ = OpParam::GetVarValue<GType>("BNScale", inputs, scope);
     bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
     epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
-    // output
-    if (outputs.count("Y")) {
-      this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
-    }
   }

  public:
...
@@ -2642,10 +2636,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
     // element wise add params
     axis_ = OpParam::GetAttr<int>("axis", attrs);
     bias_ = OpParam::InputYFrom<GType>(inputs, scope);
-    // output
-    if (outputs.count("Y")) {
-      this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
-    }
   }

  public:
...
@@ -2655,44 +2645,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
 };
 #endif

-#ifdef FUSION_DEQUANT_BN_RELU_OP
-template <typename Dtype>
-class FusionDequantBNReluParam : public FusionDequantBNParam<Dtype> {
-  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
-  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
-
- public:
-  FusionDequantBNReluParam(const VariableNameMap &inputs,
-                           const VariableNameMap &outputs,
-                           const AttributeMap &attrs, const Scope &scope)
-      : FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
-    // output
-    if (outputs.count("Out")) {
-      this->output_ = OpParam::OutFrom<GType>(outputs, scope);
-    }
-  }
-};
-#endif
-
-#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
-template <typename Dtype>
-class FusionDequantAddBNReluParam : public FusionDequantAddBNParam<Dtype> {
-  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
-  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
-
- public:
-  FusionDequantAddBNReluParam(const VariableNameMap &inputs,
-                              const VariableNameMap &outputs,
-                              const AttributeMap &attrs, const Scope &scope)
-      : FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
-    // output
-    if (outputs.count("Out")) {
-      this->output_ = OpParam::OutFrom<GType>(outputs, scope);
-    }
-  }
-};
-#endif
-
 #ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
 template <typename Dtype>
 class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
...
@@ -2707,47 +2659,9 @@ class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
     // scale output
     online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
     // offline
     if (OpParam::HasAttr("static_scale", attrs)) {
       is_static_ = true;
       static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
     }
     // x = round(scale * x)
     if (OpParam::HasAttr("round_type", attrs)) {
       round_type_ = OpParam::GetAttr<RoundType>("round_type", attrs);
     }
   }

  public:
   RType *online_scale_;
   // if static scale or not
   bool is_static_ = false;
   // quantize scale
   float static_scale_ = 1.0f;
   // round method type
   // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
   RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
 };
 #endif

 #ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
 template <typename Dtype>
 class FusionDequantAddBNReluQuantParam
-    : public FusionDequantAddBNReluParam<Dtype> {
-  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
-  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
-
- public:
-  FusionDequantAddBNReluQuantParam(const VariableNameMap &inputs,
-                                   const VariableNameMap &outputs,
-                                   const AttributeMap &attrs,
-                                   const Scope &scope)
-      : FusionDequantAddBNReluParam<Dtype>(inputs, outputs, attrs, scope) {
-    // scale output
-    online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
-    // offline
-    if (OpParam::HasAttr("static_scale", attrs)) {
-      is_static_ = true;
-      static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
-    }
-    if (inputs.count("InScale")) {
-      offline_ = true;
-      offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
-    }
-    // x = round(scale * x)
-    if (OpParam::HasAttr("round_type", attrs)) {
...
@@ -2757,10 +2671,10 @@ class FusionDequantAddBNReluQuantParam
  public:
   RType *online_scale_;
-  // if static scale or not
-  bool is_static_ = false;
-  // quantize scale
-  float static_scale_ = 1.0f;
+  // quantize offline scale
+  RType *offline_scale_;
+  // if offine scale or not
+  bool offline_ = false;
   // round method type
   // RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
   RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
...
test/operators/test_conv_op.cpp
...
@@ -129,7 +129,8 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
 }

 template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
-int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
+int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
+               int groups) {
   int kernel_h = Kernel;
   int kernel_w = Kernel;
   int pad_h = Pad;
...
@@ -147,7 +148,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
   framework::DDim input_shape =
       framework::make_ddim({batch_size, input_c, input_h, input_w});
   framework::DDim filter_shape =
-      framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
+      framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});

   VariableNameMap inputs;
   VariableNameMap outputs;
...
@@ -164,13 +165,22 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
   auto filter = filter_var->template GetMutable<framework::LoDTensor>();
   SetupTensor<Itype>(filter, filter_shape, -20, 20);

+  for (int i = 0; i < input->numel(); ++i) {
+    DLOG << "input[" << i << "] = "
+         << static_cast<int>(input->data<int8_t>()[i]);
+  }
+  for (int i = 0; i < filter->numel(); ++i) {
+    DLOG << "filter[" << i << "] = "
+         << static_cast<int>(filter->data<int8_t>()[i]);
+  }
+
   auto output_var = scope.get()->Var("output");
   framework::AttributeMap attrs;
   attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
   attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
   attrs["dilations"].Set<vector<int>>(std::vector<int>({dilation_h, dilation_w}));
-  attrs["groups"].Set<int>(1);
+  attrs["groups"].Set<int>(groups);

   auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs,
                                                attrs, scope);
...
@@ -204,15 +214,15 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
   Otype *output_cmp_data = output_cmp.data<Otype>();
   for (int i = 0; i < output->numel(); ++i) {
     float gap = output_data[i] - output_cmp_data[i];
-    PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
-                          "output[%d] = %d, output_cmp[%d] = %d", i,
-                          output_data[i], i, output_cmp_data[i]);
-    // if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
-    //   LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
-    //                  << ", output_cmp_data[" << i << "] = "
-    //                  << output_cmp_data[i];
-    //   return 1;
-    // }
+    // PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
+    //                       "output[%d] = %d, output_cmp[%d] = %d", i,
+    //                       output_data[i], i, output_cmp_data[i]);
+    if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
+      LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
+                     << ", output_cmp_data[" << i << "] = "
+                     << output_cmp_data[i];
+      exit(1);
+    }
   }
   delete op;
   return 0;
...
@@ -224,7 +234,8 @@ int main(int argc, char *argv[]) {
   if (argc < 5) {
     LOG(paddle_mobile::kLOG_INFO)
         << "Usage:\n"
-        << "  ./test-int8-conv-op in_channels in_height in_width out_channels\n"
+        << "  ./test-int8-conv-op in_channels in_height in_width out_channels "
+           "[groups]\n"
         << "  params:\n"
         << "   -in_channels: int, input image's channels\n"
        << "   -in_height: int, input image's height\n"
...
@@ -236,72 +247,134 @@ int main(int argc, char *argv[]) {
   int in_height = atoi(argv[2]);
   int in_width = atoi(argv[3]);
   int out_channels = atoi(argv[4]);
-  // kernel = 3, pad = 1, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
-  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 0, stride = 2
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 1, stride = 2
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 3, stride = 2
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 0, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 1, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 3, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 5, stride = 3
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height, in_width, out_channels);
-  // kernel = 7, pad = 3, stride = 4
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height, in_width, out_channels);
-  // kernel = 3, pad = 0, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 3, pad = 0, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
-  paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 3, pad = 1, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 3, pad = 1, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
-  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 5, pad = 0, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 5, pad = 0, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
-  paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 5, pad = 2, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height, in_width, out_channels);
-  // kernel = 5, pad = 2, stride = 1
-  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
-  paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height, in_width, out_channels);
+  int groups = 1;
+  if (argc == 6) {
+    groups = atoi(argv[5]);
+  }
+  // kernel = 3, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 2, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=1";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 5, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=1";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=2";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 1, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=2";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 2, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=2";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // kernel = 3, pad = 5, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=2";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 0, stride = 2
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 1, stride = 2
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 3, stride = 2
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 0, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 1, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 3, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 5, stride = 3
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 7, pad = 3, stride = 4
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 3, pad = 0, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 3, pad = 0, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
+  // paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 3, pad = 1, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 3, pad = 1, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
+  // paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 5, pad = 0, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 5, pad = 0, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
+  // paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 5, pad = 2, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
+  // paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height, in_width, out_channels, groups);
+  // // kernel = 5, pad = 2, stride = 1
+  // LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
+  // paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height, in_width, out_channels, groups);
 }
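With the new optional groups argument, grouped (and depthwise-style) int8 convolutions can be exercised directly from the command line; for example, a hypothetical invocation would be ./test-int8-conv-op 16 32 32 16 16, where the final 16 sets groups equal to the channel count so each output channel convolves a single input channel.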
tools/op.cmake
...
@@ -249,6 +249,7 @@ if(NOT FOUND_MATCH)
     set(SUM_OP ON)
     set(QUANT_OP ON)
     set(DEQUANT_OP ON)
+    set(FUSION_DEQUANT_BN_OP ON)
     set(FUSION_DEQUANT_ADD_BN_OP ON)
     set(FUSION_DEQUANT_BN_RELU_OP ON)
     set(FUSION_DEQUANT_ADD_BN_RELU_OP ON)
...
@@ -455,6 +456,9 @@ endif()
 if (DEQUANT_OP)
     add_definitions(-DDEQUANT_OP)
 endif()
+if (FUSION_DEQUANT_BN_OP)
+    add_definitions(-DFUSION_DEQUANT_BN_OP)
+endif()
 if (FUSION_DEQUANT_ADD_BN_OP)
     add_definitions(-DFUSION_DEQUANT_ADD_BN_OP)
 endif()
...