Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
32edd42b
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
32edd42b
编写于
12月 15, 2018
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'ocr_ctc' of
https://github.com/hjchen2/paddle-mobile
into ocr_ctc
上级
9004b4d5
58fedebc
变更
27
显示空白变更内容
内联
并排
Showing
27 changed file
with
277 addition
and
307 deletion
+277
-307
src/operators/fill_constant_op.h
src/operators/fill_constant_op.h
+2
-3
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
...ors/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+4
-16
src/operators/kernel/central-arm-func/conv_add_arm_func.h
src/operators/kernel/central-arm-func/conv_add_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
...ators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
+5
-6
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
...erators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+4
-11
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_arm_func.h
src/operators/kernel/central-arm-func/conv_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
...ators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+4
-4
src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
...operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
+3
-3
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
...erators/kernel/central-arm-func/conv_transpose_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
...erators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
+3
-3
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/gru_arm_func.h
src/operators/kernel/central-arm-func/gru_arm_func.h
+10
-14
src/operators/kernel/central-arm-func/mul_arm_func.h
src/operators/kernel/central-arm-func/mul_arm_func.h
+2
-36
src/operators/kernel/cl/fusion_fc_kernel.cpp
src/operators/kernel/cl/fusion_fc_kernel.cpp
+2
-10
src/operators/kernel/mali/fushion_fc_kernel.cpp
src/operators/kernel/mali/fushion_fc_kernel.cpp
+1
-1
src/operators/kernel/mali/mul_kernel.cpp
src/operators/kernel/mali/mul_kernel.cpp
+1
-1
src/operators/math/math_func_neon.h
src/operators/math/math_func_neon.h
+6
-2
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+12
-15
src/operators/math/math_function.h
src/operators/math/math_function.h
+9
-10
src/operators/math/math_function_int8.cpp
src/operators/math/math_function_int8.cpp
+4
-4
src/operators/math/softmax.cpp
src/operators/math/softmax.cpp
+105
-128
src/operators/math/softmax.h
src/operators/math/softmax.h
+5
-1
test/CMakeLists.txt
test/CMakeLists.txt
+2
-5
test/common/test_gemm_perf.cpp
test/common/test_gemm_perf.cpp
+10
-10
test/operators/test_softmax_op.cpp
test/operators/test_softmax_op.cpp
+77
-18
tools/pre-commit.hooks/cpplint.hook
tools/pre-commit.hooks/cpplint.hook
+1
-1
未找到文件。
src/operators/fill_constant_op.h
浏览文件 @
32edd42b
...
...
@@ -25,12 +25,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
using
std
::
string
;
template
<
typename
DeviceType
,
typename
T
>
class
FillConstantOp
:
public
framework
::
OperatorBase
<
DeviceType
>
{
public:
FillConstantOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
FillConstantOp
(
const
st
d
::
st
ring
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
...
...
@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
tensor
->
Resize
(
framework
::
make_ddim
(
param_
.
Shape
()));
tensor
->
mutable_data
(
framework
::
ToTypeIndex
(
data_type
));
math
::
set_c
onstant
(
tensor
,
value
);
math
::
SetC
onstant
(
tensor
,
value
);
}
void
Init
()
{}
...
...
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
...
...
@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias1_slice
=
bias1_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
float
*
biase_data1
=
bias1_slice
.
data
<
float
>
();
// int n = bias1_slice.dims()[0];
// int m = bias1_slice.dims()[1];
// for(int i=0;i<n*m;i++){
// if(biase_data1[i]!=0)
// DLOG<<biase_data1[i]<<",yangfei";
// }
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math
::
matmulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
biase_data1
);
}
}
...
...
@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
// FUSION_CONVADDADDPRELU_OP
src/operators/kernel/central-arm-func/conv_add_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatM
ul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
false
,
biase_data
);
}
...
...
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -25,6 +25,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
void
ConvAddBNReluBasic
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
...
...
@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
template
<
typename
P
>
void
ConvAddBNReluCompute
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
Tensor
Bias
;
...
...
@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> ¶m) {
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
param
.
Filter
()
->
dims
()[
3
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
3
&&
param
.
Strides
()[
0
]
==
2
)
{
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math
::
DepthwiseConvAddBNRelu3x3s2p1v2
(
param
.
Input
(),
param
.
Filter
(),
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
true
);
...
...
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
...
...
@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
// DLOG<<"yangfei";
// DLOG<<bias.dims();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
float
*
biase_data
=
bias
.
data
<
float
>
();
...
...
@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math
::
matmulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
nullptr
);
}
}
...
...
@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
// FUSION_CONVADDPRELU_OP
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
alpha
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
alpha
,
&
out_slice
,
beta
,
true
,
bias_data
);
}
}
...
...
src/operators/kernel/central-arm-func/conv_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> ¶m) {
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
false
,
static_cast
<
Otype
*>
(
nullptr
));
...
...
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias_data
=
bias_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
,
&
new_scale
,
&
new_bias
,
g
,
bias_data
.
data
<
float
>
());
static_cast
<
float
>
(
1
),
true
,
&
new_scale
,
&
new_bias
,
g
,
bias_data
.
data
<
float
>
());
}
}
}
...
...
src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
...
...
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> ¶m) {
Tensor
filter_slice
=
filter
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
Tensor
out_slice
=
output_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
P
,
P
>
(
filter_slice
,
true
,
in_slice
,
false
,
math
::
MatM
ul
<
P
,
P
>
(
filter_slice
,
true
,
in_slice
,
false
,
static_cast
<
P
>
(
1.0
),
&
col_matrix
,
static_cast
<
P
>
(
0.0
));
if
(
data_dim
==
2U
)
{
col2im
(
col
,
dilations
,
strides
,
...
...
src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> ¶m) {
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
...
...
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
Otype
)
*
classes
);
}
math
::
matm
ul
<
Itype
,
Otype
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
}
...
...
src/operators/kernel/central-arm-func/gru_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -25,18 +25,16 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
Device
,
typename
T
>
inline
void
ReorderInitState
(
const
framework
::
Tensor
&
src
,
std
::
vector
<
size_t
>
index_lod
,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
Device
Type
,
T
>
row_shuffle
;
math
::
CopyMatrixRowsFunctor
<
Device
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
());
row_shuffle
(
src
,
index_lod
,
dst
,
indexed_src
);
}
template
<
typename
P
>
template
<
typename
T
>
void
GruCompute
(
const
GruParam
<
CPU
>&
param
)
{
auto
*
input
=
param
.
InputInput
();
auto
*
h0
=
param
.
InputH0
();
...
...
@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
bool
is_reverse
=
param
.
IsReverse
();
math
::
LoDTensor2BatchFunctor
<
CPU
,
float
>
to_batch
;
to_batch
(
*
input
,
batch_gate
,
true
,
is_reverse
);
// math::ClearTensor<CPU, float> clearTensor;
// clearTensor(batch_gate);
if
(
bias
)
{
math
::
RowwiseAdd
<
CPU
,
float
>
add_bias
;
add_bias
(
*
batch_gate
,
*
bias
,
batch_gate
);
...
...
@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
gru_value
.
gate_weight
=
const_cast
<
float
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
float
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Tensor
ordered_h0
;
std
::
vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
...
...
@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
// BUG
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
framework
::
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
framework
::
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
framework
::
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
float
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
float
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
float
>
();
...
...
@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
// GRU_OP
src/operators/kernel/central-arm-func/mul_arm_func.h
浏览文件 @
32edd42b
...
...
@@ -19,40 +19,6 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
// 1、如果x,y维度都是2维,
// x = [[1,2], y = [[5,6],
// [3,4]] [7,8]]
// 运算结果为正常矩阵相乘。结果 out =
// [[1*5+2*7,1*6+2*8],[3*5+4*7, 3*6+4*8]]
//
// 2、如果x的维度大于2或者y的维度大于2,x的维度(2,3,4) ,y的维度(4,1,2)
// x = [[[1,2,3,4],
// [2,3,4,5],
// [3,4,5,6]],
// [[1,2,3,4],
// [2,3,4,5],
// [3,4,5,6]]]
// y = [[[1,2]],
// [[3,4]],
// [[5,6]],
// [[7,8]]]
// 需要借助x_num_col_dims和y_num_col_dims将x和y的维度转换为2维
// 从模型中读到参数,x_num_col_dims = 2,y_num_col_dims = 1,左开右闭
// (1) 将x = (2,3,4)的index [0,x_num_col_dims)部分2,3相乘,得到6,
// [x_num_col_dims,xdim.size())部分4相乘,得到4,
// 将Tensor x的dims重写成(6,4)
// (2) 将y = (4,1,2)的index [0,y_num_col_dims)部分4相乘,得到4,
// [y_num_col_dims,ydim.size())部分1,2相乘,得到2,
// 将Tensor y的dims重写成(4,2)
// 并不影响x,y在内存中的分布。
// x = [[1,2,3,4], y = [[1,2],
// [2,3,4,5], [3,4],
// [3,4,5,6], 矩阵乘法 [5,6],
// [1,2,3,4], [7,8]]
// [2,3,4,5],
// [3,4,5,6]]
// 结果x(6行4列)乘y(4行2列),按1中矩阵相乘,结果out(6行2列)
template
<
typename
P
>
void
MulCompute
(
const
MulParam
<
CPU
>
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
...
...
@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> ¶m) {
}
if
(
param
.
InputX
()
->
type
()
==
typeid
(
int8_t
))
{
out
->
mutable_data
<
int32_t
>
();
math
::
matm
ul
<
int8_t
,
int32_t
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
}
else
{
out
->
mutable_data
<
float
>
();
math
::
matm
ul
<
float
,
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
float
,
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
}
...
...
src/operators/kernel/cl/fusion_fc_kernel.cpp
浏览文件 @
32edd42b
...
...
@@ -94,27 +94,19 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> ¶m, cl_context context,
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
// bias_data的维度和out的维度一致
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
math
::
MatMul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
out_image
->
InitEmptyImage
(
context
,
commandQueue
,
out
->
dims
());
framework
::
TensorToCLImage
(
out
,
out_image
,
context
,
commandQueue
,
kernel1
);
DLOG
<<
*
out
;
delete
(
input_x
);
delete
(
input_y
);
delete
(
input_z
);
delete
(
out
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
template
<
>
void
FusionFcKernel
<
GPU_CL
,
float
>::
Compute
(
const
FusionFcParam
<
GPU_CL
>
&
param
)
{
...
...
src/operators/kernel/mali/fushion_fc_kernel.cpp
浏览文件 @
32edd42b
...
...
@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
DLOG
<<
out_data
[
i
];
}
math
::
matm
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
math
::
MatM
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
));
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
...
...
src/operators/kernel/mali/mul_kernel.cpp
浏览文件 @
32edd42b
...
...
@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> ¶m) {
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
math
::
matm
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
math
::
MatM
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
(
out_dim
);
...
...
src/operators/math/math_func_neon.h
浏览文件 @
32edd42b
...
...
@@ -38,7 +38,11 @@ limitations under the License. */
*
* (this is the zlib license)
*/
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once
#include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u
...
...
@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
static
inline
float32x4_t
div_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
float32x4_t
reciprocal
=
vrecpeq_f32
(
b
);
reciprocal
=
vmulq_f32
(
vrecpsq_f32
(
b
,
reciprocal
),
reciprocal
);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return
vmulq_f32
(
a
,
reciprocal
);
}
static
inline
float32x4_t
pow_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
// pow(x, m) = exp(m * log(x))
return
exp_ps
(
vmulq_f32
(
b
,
log_ps
(
a
)));
}
#endif // __ARM_NEON__
src/operators/math/math_function.cpp
浏览文件 @
32edd42b
...
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/math_function.h"
#include <cstring>
#include <string>
#include "common/enforce.h"
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
...
...
@@ -35,13 +35,13 @@ struct TensorSetConstant {
float
value_
;
};
void
set_c
onstant
(
framework
::
Tensor
*
tensor
,
float
value
)
{
void
SetC
onstant
(
framework
::
Tensor
*
tensor
,
float
value
)
{
framework
::
VisitDataType
(
framework
::
ToDataType
(
tensor
->
type
()),
TensorSetConstant
(
tensor
,
value
));
}
template
<
>
void
matm
ul
<
float
,
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
float
,
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
float
*
bias
)
{
...
...
@@ -50,7 +50,7 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
...
...
@@ -72,7 +72,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
}
#ifdef _OPENMP
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
...
...
@@ -92,19 +91,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
}
}
template
<
>
void
matmulWithBn
<
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
float
*
bias
)
{
void
MatMulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
float
*
bias
)
{
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
...
...
@@ -122,7 +120,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#endif
}
void
matm
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
...
...
@@ -132,7 +130,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
...
...
@@ -146,7 +144,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
gemm
.
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
#endif
}
...
...
src/operators/math/math_function.h
浏览文件 @
32edd42b
...
...
@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <string>
#include "framework/tensor.h"
...
...
@@ -22,37 +21,37 @@ namespace paddle_mobile {
namespace
operators
{
namespace
math
{
void
set_c
onstant
(
framework
::
Tensor
*
tensor
,
float
value
);
void
SetC
onstant
(
framework
::
Tensor
*
tensor
,
float
value
);
template
<
typename
Itype
,
typename
Otype
>
void
matm
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
=
false
,
Otype
*
bias
=
nullptr
);
template
<
typename
Itype
,
typename
Otype
>
void
matm
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
Otype
*
bias
,
bool
addOnRow
);
template
<
typename
T
>
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatMulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
T
*
bias
=
nullptr
);
int
group
,
float
*
bias
=
nullptr
);
void
matm
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
Device
,
typename
T
>
struct
ClearTensor
{
void
operator
()(
framework
::
Tensor
*
tensor
);
};
template
<
typename
Device
Type
,
typename
T
>
template
<
typename
Device
,
typename
T
>
struct
RowwiseAdd
{
void
operator
()(
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
output
);
...
...
src/operators/math/math_function_int8.cpp
浏览文件 @
32edd42b
...
...
@@ -22,7 +22,7 @@ namespace operators {
namespace
math
{
template
<
>
void
matm
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
,
...
...
@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int32_t
M
=
dim_out
[
0
];
int32_t
N
=
dim_out
[
1
];
...
...
@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
}
template
<
>
void
matm
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
)
{
matm
ul
<
int8_t
,
int32_t
>
(
matrix_a
,
trans_a
,
matrix_b
,
trans_b
,
alpha
,
MatM
ul
<
int8_t
,
int32_t
>
(
matrix_a
,
trans_a
,
matrix_b
,
trans_b
,
alpha
,
matrix_out
,
beta
,
relu
,
bias
,
false
);
}
...
...
src/operators/math/softmax.cpp
浏览文件 @
32edd42b
...
...
@@ -15,154 +15,131 @@ limitations under the License. */
#ifdef SOFTMAX_OP
#include "operators/math/softmax.h"
#include "common/types.h"
#ifdef __ARM_NEON
#include <math.h>
#include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h"
#endif
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
DDim
;
using
framework
::
Tensor
;
template
<
typename
T
>
class
SoftmaxFuntor
<
CPU
,
T
>
{
#ifdef __ARM_NEON
void
sum
(
float
*
input
,
float
*
sumptr
,
int
inner_size
,
int
outter_size
)
{
float32x4_t
acc
=
vdupq_n_f32
(
0
);
float
sum_
=
0
;
for
(
int
i
=
0
;
i
<
outter_size
;
++
i
)
{
float
*
input_outer_ptr
=
input
+
i
*
inner_size
;
int
nn
=
inner_size
>>
2
;
int
left
=
inner_size
-
(
nn
<<
2
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
acc
=
vaddq_f32
(
acc
,
vec_input
);
input_outer_ptr
+=
4
;
}
float32x2_t
vsum_
=
vadd_f32
(
vget_high_f32
(
acc
),
vget_low_f32
(
acc
));
sum_
=
vget_lane_f32
(
vsum_
,
0
)
+
vget_lane_f32
(
vsum_
,
1
);
for
(;
left
>
0
;
left
--
)
{
sum_
+=
*
input_outer_ptr
;
input_outer_ptr
++
;
}
}
for
(
int
j
=
0
;
j
<
inner_size
*
outter_size
;
++
j
)
{
sumptr
[
j
]
=
sum_
;
}
}
void
SoftmaxCacl
(
const
Tensor
*
X
,
Tensor
*
Y
)
{
const
float
*
input
=
X
->
data
<
float
>
();
const
DDim
&
dDim
=
X
->
dims
();
int
axis_index
=
1
;
if
(
dDim
.
size
()
<
4
)
{
axis_index
=
0
;
}
DDim
outer_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
0
,
axis_index
+
1
);
DDim
inner_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
axis_index
+
1
,
dDim
.
size
());
int
out_size
=
paddle_mobile
::
framework
::
product
(
outer_ddim
);
int
inner_size
=
paddle_mobile
::
framework
::
product
(
inner_ddim
);
auto
*
max_ptr
=
new
float
[
inner_size
*
out_size
];
// max
for
(
int
j
=
0
;
j
<
out_size
;
++
j
)
{
const
float
*
input_outer_ptr
=
input
+
j
*
inner_size
;
float
*
max_outer_ptr
=
max_ptr
+
j
*
inner_size
;
float
max_
=
0
;
for
(
int
i
=
0
;
i
<
inner_size
;
++
i
)
{
const
float
*
input_inner_ptr
=
input_outer_ptr
+
i
;
max_
=
std
::
max
(
max_
,
input_inner_ptr
[
0
]);
}
for
(
int
k
=
0
;
k
<
inner_size
;
++
k
)
{
max_outer_ptr
[
k
]
=
max_
;
}
}
// exp(value - max)
float
*
exp_sub_max
=
new
float
[
inner_size
*
out_size
];
float
*
exp_sub_max_ptr
=
&
exp_sub_max
[
0
];
for
(
int
l
=
0
;
l
<
out_size
;
++
l
)
{
const
float
*
input_outer_ptr
=
input
+
l
*
inner_size
;
float
*
max_outer_ptr
=
max_ptr
+
l
*
inner_size
;
int
nn
=
inner_size
>>
2
;
int
left
=
inner_size
-
(
nn
<<
2
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
float32x4_t
vec_max
=
vld1q_f32
(
max_outer_ptr
);
float32x4_t
vec_sub
=
vsubq_f32
(
vec_input
,
vec_max
);
float32x4_t
vec_exp
=
exp_ps
(
vec_sub
);
vst1q_f32
(
exp_sub_max_ptr
,
vec_exp
);
input_outer_ptr
+=
4
;
max_outer_ptr
+=
4
;
exp_sub_max_ptr
+=
4
;
}
for
(;
left
>
0
;
left
--
)
{
*
exp_sub_max_ptr
=
expf
(
*
input_outer_ptr
-
*
max_outer_ptr
);
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#ifndef __aarch64__
// Horizontal maximum: returns the largest of the four float lanes in `r`.
// This definition sits behind `#ifndef __aarch64__`, i.e. it is only compiled
// when the toolchain is not targeting AArch64.
inline float32_t vmaxvq_f32(const float32x4_t &r) {
  // Fold 4 lanes into 2 by taking the lane-wise max of the two halves.
  const float32x2_t upper_half = vget_high_f32(r);
  const float32x2_t lower_half = vget_low_f32(r);
  const float32x2_t pair_max = vmax_f32(upper_half, lower_half);
  // A pairwise max of the vector with itself leaves the overall max in lane 0.
  const float32x2_t folded = vpmax_f32(pair_max, pair_max);
  return vget_lane_f32(folded, 0);
}
input_outer_ptr
++
;
max_outer_ptr
++
;
exp_sub_max_ptr
++
;
}
}
float
*
sumptr
=
new
float
[
inner_size
*
out_size
];
// sum exp
sum
(
exp_sub_max
,
sumptr
,
inner_size
,
out_size
);
// div
auto
*
out_ptr
=
Y
->
mutable_data
<
float
>
();
for
(
int
l
=
0
;
l
<
out_size
;
++
l
)
{
const
float
*
input_outer_ptr
=
exp_sub_max
+
l
*
inner_size
;
float
*
output_outer_ptr
=
out_ptr
+
l
*
inner_size
;
float
*
sum_outer_ptr
=
sumptr
+
l
*
inner_size
;
int
nn
=
inner_size
>>
2
;
int
left
=
inner_size
-
(
nn
<<
2
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
float32x4_t
vec_sum
=
vld1q_f32
(
sum_outer_ptr
);
float32x4_t
vec_div
=
div_ps
(
vec_input
,
vec_sum
);
vst1q_f32
(
output_outer_ptr
,
vec_div
);
input_outer_ptr
+=
4
;
output_outer_ptr
+=
4
;
sum_outer_ptr
+=
4
;
// Horizontal sum: returns the sum of the four float lanes in `r`.
// Like the max helper above it, this lives inside the `#ifndef __aarch64__`
// region closed by the `#endif  // __aarch64__` that follows.
inline float32_t vaddvq_f32(const float32x4_t &r) {
  // Add the high and low halves lane-wise: 4 partial values become 2.
  const float32x2_t upper_half = vget_high_f32(r);
  const float32x2_t lower_half = vget_low_f32(r);
  const float32x2_t pair_sum = vadd_f32(upper_half, lower_half);
  // A pairwise add of the vector with itself puts the total in lane 0.
  const float32x2_t folded = vpadd_f32(pair_sum, pair_sum);
  return vget_lane_f32(folded, 0);
}
#endif // __aarch64__
#endif // __ARM_NEON__
float
find_max
(
const
float
*
input
,
const
int
num_classes
)
{
int
remain
=
num_classes
;
float
max
=
-
std
::
numeric_limits
<
float
>::
max
();
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int
loop
=
num_classes
>>
3
;
remain
=
num_classes
&
0x7
;
float32x4_t
__max
=
vdupq_n_f32
(
max
)
;
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
input
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
input
);
float32x4_t
x1
=
vld1q_f32
(
input
+
4
);
__max
=
vmaxq_f32
(
x0
,
__max
);
__max
=
vmaxq_f32
(
x1
,
__max
);
}
max
=
vmaxvq_f32
(
__max
);
#endif
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
max
=
std
::
max
(
max
,
input
[
i
])
;
}
for
(;
left
>
0
;
left
--
)
{
*
output_outer_ptr
=
(
*
input_outer_ptr
)
/
(
*
sum_outer_ptr
);
input_outer_ptr
++
;
output_outer_ptr
++
;
sum_outer_ptr
++
;
return
max
;
}
template
<
>
void
SoftmaxFuntor
<
CPU
,
float
>::
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
const
framework
::
DDim
&
dims
=
X
->
dims
();
int
batch_size
=
dims
[
0
];
int
num_classes
=
dims
[
dims
.
size
()
-
1
];
int
channels
=
X
->
numel
()
/
batch_size
/
num_classes
;
const
float
*
x
=
X
->
data
<
float
>
();
float
*
y
=
Y
->
mutable_data
<
float
>
();
#pragma omp parallel for collapse(2)
for
(
int
batch
=
0
;
batch
<
X
->
dims
()[
0
];
++
batch
)
{
for
(
int
channel
=
0
;
channel
<
channels
;
++
channel
)
{
size_t
offset
=
(
batch
*
channels
+
channel
)
*
num_classes
;
const
float
*
input
=
x
+
offset
;
float
*
output
=
y
+
offset
;
// find max
float
max
=
find_max
(
input
,
num_classes
);
// exp(x - max)
int
remain
=
num_classes
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int
loop
=
num_classes
>>
3
;
remain
=
num_classes
&
0x7
;
float32x4_t
__max
=
vdupq_n_f32
(
max
);
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
input
+=
8
,
output
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
input
);
float32x4_t
x1
=
vld1q_f32
(
input
+
4
);
x0
=
vsubq_f32
(
x0
,
__max
);
x1
=
vsubq_f32
(
x1
,
__max
);
x0
=
exp_ps
(
x0
);
x1
=
exp_ps
(
x1
);
vst1q_f32
(
output
,
x0
);
vst1q_f32
(
output
+
4
,
x1
);
}
#endif // __ARM_NEON__
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
output
[
i
]
=
std
::
expf
(
input
[
i
]
-
max
);
}
// sum(exp(x - max))
float
sum
=
0.
f
;
output
=
y
+
offset
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t
__sum
=
vdupq_n_f32
(
0.
f
);
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
output
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
output
);
float32x4_t
x1
=
vld1q_f32
(
output
+
4
);
__sum
=
vaddq_f32
(
x0
,
__sum
);
__sum
=
vaddq_f32
(
x1
,
__sum
);
}
sum
+=
vaddvq_f32
(
__sum
);
#endif // __ARM_NEON__
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
sum
+=
output
[
i
];
}
// exp(x - max) / sum
float
inv_sum
=
1.
f
/
sum
;
output
=
y
+
offset
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
// Normalization step: multiply every exp(x - max) value already stored in
// `output` by 1 / sum. Eight floats are processed per iteration using two
// 4-lane registers; the `remain` trailing elements are left for the scalar
// loop elsewhere in this function.
float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
for (int i = 0; i < loop; ++i, output += 8) {
  float32x4_t x0 = vld1q_f32(output);
  float32x4_t x1 = vld1q_f32(output + 4);
  x0 = vmulq_f32(x0, __inv_sum);
  x1 = vmulq_f32(x1, __inv_sum);
  vst1q_f32(output, x0);
  // The upper four lanes must come from x1. Storing x0 here (as the code did
  // before) overwrote elements 4..7 of each group with copies of 0..3.
  vst1q_f32(output + 4, x1);
}
#else
#endif // ARM_NEON
public:
void
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
const
DDim
dDim
=
X
->
dims
();
int
dim1
=
dDim
[
dDim
.
size
()
-
1
];
int
dim0
=
X
->
numel
()
/
dim1
/
dDim
[
0
];
framework
::
DDim
matrix_shape
=
{
dim0
,
dim1
};
for
(
int
i
=
0
;
i
<
dDim
[
0
];
++
i
)
{
framework
::
Tensor
sub_X
=
X
->
Slice
(
i
,
i
+
1
);
framework
::
Tensor
sub_Y
=
Y
->
Slice
(
i
,
i
+
1
);
sub_X
.
Resize
(
matrix_shape
);
sub_Y
.
Resize
(
matrix_shape
);
for
(
int
j
=
0
;
j
<
dim0
;
j
++
)
{
framework
::
Tensor
sub_x
=
sub_X
.
Slice
(
j
,
j
+
1
);
framework
::
Tensor
sub_y
=
sub_Y
.
Slice
(
j
,
j
+
1
);
#ifdef __ARM_NEON
SoftmaxCacl
(
&
sub_x
,
&
sub_y
);
#endif
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
output
[
i
]
*=
inv_sum
;
}
}
}
};
template
class
SoftmaxFuntor
<
CPU
,
float
>;
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif // SOFTMAX_OP
src/operators/math/softmax.h
浏览文件 @
32edd42b
...
...
@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
template
<
typename
Device
Type
,
typename
T
>
// Softmax functor, parameterised on the target device tag and element type.
// Only the call operator is declared here; definitions are supplied by
// per-device specialisations (e.g. the <CPU, float> one in softmax.cpp).
template <class Device, class T>
class SoftmaxFuntor {
 public:
  // X: input tensor (read-only). Y: output tensor that receives the result.
  void operator()(const framework::Tensor *X, framework::Tensor *Y);
};
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
...
...
test/CMakeLists.txt
浏览文件 @
32edd42b
...
...
@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE
(
test-inference-api framework/test_inference_api.cpp
)
target_link_libraries
(
test-inference-api paddle-mobile
)
# gen test log
# gen test
ADD_EXECUTABLE
(
test-optimize framework/test_optimize.cpp
)
target_link_libraries
(
test-optimize paddle-mobile
)
#gen test
ADD_EXECUTABLE
(
test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-pool-op paddle-mobile
)
#gen test
ADD_EXECUTABLE
(
test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-softmax paddle-mobile
)
ADD_EXECUTABLE
(
test-softmax
-op
operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-softmax
-op
paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test-gemm-accuracy common/test_gemm_accuracy.cpp
)
...
...
test/common/test_gemm_perf.cpp
浏览文件 @
32edd42b
...
...
@@ -73,14 +73,14 @@ int main() {
// float
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
float
,
float
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
float
,
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
}
auto
time_start0
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
float
,
float
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
float
,
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
}
...
...
@@ -91,14 +91,14 @@ int main() {
// int8_t without bias
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
}
auto
time_start1
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
}
...
...
@@ -109,13 +109,13 @@ int main() {
// int8_t with bias, column element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
auto
time_start2
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
...
...
@@ -126,13 +126,13 @@ int main() {
// int8_t with bias, row element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
auto
time_start3
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
...
...
@@ -143,13 +143,13 @@ int main() {
// int8_t with bias&relu
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
}
auto
time_start4
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
}
...
...
test/operators/test_softmax_op.cpp
浏览文件 @
32edd42b
...
...
@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include <limits>
#include "../test_include.h"
#include "operators/softmax_op.h"
int
main
()
{
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
auto
program
=
loader
.
Load
(
std
::
string
(
g_mobilenet
));
if
(
program
.
originProgram
==
nullptr
)
{
DLOG
<<
"program read file"
;
namespace
paddle_mobile
{
// Plain scalar softmax used as the reference result in this test.
//
// The tensor is treated as a stack of rows of length `num_classes` (the size
// of the last dimension). dims[0] is taken as the batch size and everything in
// between is folded into `channels`. Each row is normalised independently:
//   y[j] = exp(x[j] - max(x)) / sum_k exp(x[k] - max(x))
//
// X: input tensor, read as float.
// Y: output tensor; its float buffer is obtained via mutable_data<float>().
void Softmax(const framework::Tensor *X, framework::Tensor *Y) {
  const framework::DDim &shape = X->dims();
  const int batch_size = shape[0];
  const int num_classes = shape[shape.size() - 1];
  const int channels = X->numel() / batch_size / num_classes;
  const float *src = X->data<float>();
  float *dst = Y->mutable_data<float>();

  for (int batch = 0; batch < batch_size; ++batch) {
    for (int c = 0; c < channels; ++c) {
      const size_t offset = (batch * channels + c) * num_classes;
      const float *in_row = src + offset;
      float *out_row = dst + offset;

      // Row maximum, subtracted below so exp() never sees a large positive
      // argument.
      float row_max = -std::numeric_limits<float>::max();
      for (int j = 0; j < num_classes; ++j) {
        if (in_row[j] > row_max) {
          row_max = in_row[j];
        }
      }

      // Store the shifted exponentials and accumulate their sum.
      float denom = 0.f;
      for (int j = 0; j < num_classes; ++j) {
        const float e = std::expf(in_row[j] - row_max);
        denom += e;
        out_row[j] = e;
      }

      // Normalise the row.
      for (int j = 0; j < num_classes; ++j) {
        out_row[j] /= denom;
      }
    }
  }
}
int
TestSoftmaxOp
(
const
std
::
vector
<
int
>
input_shape
)
{
framework
::
DDim
dims
=
framework
::
make_ddim
(
input_shape
);
VariableNameMap
inputs
;
VariableNameMap
outputs
;
auto
scope
=
std
::
make_shared
<
framework
::
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"input"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
auto
input_var
=
scope
.
get
()
->
Var
(
"input"
);
auto
input
=
input_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
float
>
(
input
,
dims
,
-
100.0
,
100.0
);
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
framework
::
AttributeMap
attrs
;
auto
*
op
=
new
operators
::
SoftmaxOp
<
CPU
,
float
>
(
"softmax"
,
inputs
,
outputs
,
attrs
,
scope
);
op
->
InferShape
();
op
->
Init
();
op
->
Run
();
framework
::
Tensor
output_cmp
;
float
*
output_cmp_data
=
output_cmp
.
mutable_data
<
float
>
(
output
->
dims
());
Softmax
(
input
,
&
output_cmp
);
const
float
*
output_data
=
output
->
data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
->
numel
();
++
i
)
{
float
gap
=
output_data
[
i
]
-
output_cmp_data
[
i
];
if
(
std
::
abs
(
gap
/
(
output_data
[
i
]
+
1e-5
))
>
1e-3
)
{
LOG
(
kLOG_INFO
)
<<
"output_data["
<<
i
<<
"] = "
<<
output_data
[
i
]
<<
", output_cmp_data["
<<
i
<<
"] = "
<<
output_cmp_data
[
i
];
delete
op
;
exit
(
1
);
}
Executor4Test
<
paddle_mobile
::
CPU
,
paddle_mobile
::
operators
::
SoftmaxOp
<
paddle_mobile
::
CPU
,
float
>>
executor
(
program
,
"softmax"
);
paddle_mobile
::
framework
::
Tensor
input
;
SetupTensor
<
float
>
(
&
input
,
{
1
,
1000
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
auto
out_ddim
=
paddle_mobile
::
framework
::
make_ddim
({
1
,
1000
});
auto
output
=
executor
.
Predict
(
input
,
"reshape_0.tmp_0"
,
"softmax_0.tmp_0"
,
out_ddim
);
auto
*
output_ptr
=
output
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
output
->
numel
();
++
j
)
{
DLOG
<<
" value of output: "
<<
output_ptr
[
j
];
}
delete
op
;
return
0
;
}
}
// namespace paddle_mobile
int
main
(
int
argc
,
char
*
argv
[])
{
TestSoftmaxOp
({
128
,
1000
});
TestSoftmaxOp
({
128
,
10
,
1000
});
return
0
;
}
tools/pre-commit.hooks/cpplint.hook
浏览文件 @
32edd42b
...
...
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for
file
in
$(
git diff
--cached
--name-status
|
awk
'$1 != "D" {print $2}'
|
\
grep
-v
".pb.cpp"
|
grep
-v
".pb.h"
|
grep
-v
".pb-c.h"
|
grep
-v
".pb-c.c"
|
\
grep
-v
"protobuf-c.h"
|
grep
-v
"protobuf-c.c"
|
grep
-v
"paddle_mobile_jni.cpp"
)
;
do
grep
-v
"protobuf-c.h"
|
grep
-v
"protobuf-c.c"
)
;
do
cpplint
$file
;
TOTAL_ERRORS
=
$(
expr
$TOTAL_ERRORS
+
$?
)
;
done
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录