Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
c5f70926
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c5f70926
编写于
12月 15, 2018
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'ocr_ctc' of
https://github.com/hjchen2/paddle-mobile
into ocr_ctc
上级
9729edac
3793beef
变更
28
显示空白变更内容
内联
并排
Showing
28 changed file
with
279 addition
and
309 deletion
+279
-309
src/framework/executor.cpp
src/framework/executor.cpp
+2
-2
src/operators/fill_constant_op.h
src/operators/fill_constant_op.h
+2
-3
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
...ors/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+4
-16
src/operators/kernel/central-arm-func/conv_add_arm_func.h
src/operators/kernel/central-arm-func/conv_add_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
...ators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
+5
-6
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
...erators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+4
-11
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_arm_func.h
src/operators/kernel/central-arm-func/conv_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
...ators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+4
-4
src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
...operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
+3
-3
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
...erators/kernel/central-arm-func/conv_transpose_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
...erators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
+3
-3
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+1
-1
src/operators/kernel/central-arm-func/gru_arm_func.h
src/operators/kernel/central-arm-func/gru_arm_func.h
+10
-14
src/operators/kernel/central-arm-func/mul_arm_func.h
src/operators/kernel/central-arm-func/mul_arm_func.h
+2
-36
src/operators/kernel/cl/fusion_fc_kernel.cpp
src/operators/kernel/cl/fusion_fc_kernel.cpp
+2
-10
src/operators/kernel/mali/fushion_fc_kernel.cpp
src/operators/kernel/mali/fushion_fc_kernel.cpp
+1
-1
src/operators/kernel/mali/mul_kernel.cpp
src/operators/kernel/mali/mul_kernel.cpp
+1
-1
src/operators/math/math_func_neon.h
src/operators/math/math_func_neon.h
+6
-2
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+12
-15
src/operators/math/math_function.h
src/operators/math/math_function.h
+9
-10
src/operators/math/math_function_int8.cpp
src/operators/math/math_function_int8.cpp
+4
-4
src/operators/math/softmax.cpp
src/operators/math/softmax.cpp
+105
-128
src/operators/math/softmax.h
src/operators/math/softmax.h
+5
-1
test/CMakeLists.txt
test/CMakeLists.txt
+2
-5
test/common/test_gemm_perf.cpp
test/common/test_gemm_perf.cpp
+10
-10
test/operators/test_softmax_op.cpp
test/operators/test_softmax_op.cpp
+77
-18
tools/pre-commit.hooks/cpplint.hook
tools/pre-commit.hooks/cpplint.hook
+1
-1
未找到文件。
src/framework/executor.cpp
浏览文件 @
c5f70926
...
@@ -350,7 +350,7 @@ PMStatus Executor<Device, T>::Predict() {
...
@@ -350,7 +350,7 @@ PMStatus Executor<Device, T>::Predict() {
_tp
[
ops_list_
[
i
]
->
Type
()]
+=
timeCost
;
_tp
[
ops_list_
[
i
]
->
Type
()]
+=
timeCost
;
}
}
}
}
DLOG
<<
"====================[ profile ]======================"
;
printf
(
"====================[ profile ]======================
\n
"
)
;
typedef
std
::
pair
<
std
::
string
,
uint64_t
>
prof_t
;
typedef
std
::
pair
<
std
::
string
,
uint64_t
>
prof_t
;
std
::
vector
<
prof_t
>
_tv
(
_tp
.
begin
(),
_tp
.
end
());
std
::
vector
<
prof_t
>
_tv
(
_tp
.
begin
(),
_tp
.
end
());
uint64_t
_ptotal
=
0
;
uint64_t
_ptotal
=
0
;
...
@@ -367,7 +367,7 @@ PMStatus Executor<Device, T>::Predict() {
...
@@ -367,7 +367,7 @@ PMStatus Executor<Device, T>::Predict() {
static_cast
<
float
>
(
p
.
second
),
static_cast
<
float
>
(
p
.
second
),
static_cast
<
float
>
(
p
.
second
)
/
_ptotal
*
100.0
);
static_cast
<
float
>
(
p
.
second
)
/
_ptotal
*
100.0
);
}
}
DLOG
<<
"====================[---------]======================"
;
printf
(
"====================[---------]======================
\n
"
)
;
#endif
#endif
return
PMSuccess
;
return
PMSuccess
;
}
}
...
...
src/operators/fill_constant_op.h
浏览文件 @
c5f70926
...
@@ -25,12 +25,11 @@ limitations under the License. */
...
@@ -25,12 +25,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
using
std
::
string
;
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
DeviceType
,
typename
T
>
class
FillConstantOp
:
public
framework
::
OperatorBase
<
DeviceType
>
{
class
FillConstantOp
:
public
framework
::
OperatorBase
<
DeviceType
>
{
public:
public:
FillConstantOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
FillConstantOp
(
const
st
d
::
st
ring
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
attrs
,
const
framework
::
AttributeMap
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
...
@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
...
@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
tensor
->
Resize
(
framework
::
make_ddim
(
param_
.
Shape
()));
tensor
->
Resize
(
framework
::
make_ddim
(
param_
.
Shape
()));
tensor
->
mutable_data
(
framework
::
ToTypeIndex
(
data_type
));
tensor
->
mutable_data
(
framework
::
ToTypeIndex
(
data_type
));
math
::
set_c
onstant
(
tensor
,
value
);
math
::
SetC
onstant
(
tensor
,
value
);
}
}
void
Init
()
{}
void
Init
()
{}
...
...
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
浏览文件 @
c5f70926
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#pragma once
#include <string>
#include <vector>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
...
@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
...
@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias1_slice
=
bias1_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias1_slice
=
bias1_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
float
*
biase_data1
=
bias1_slice
.
data
<
float
>
();
float
*
biase_data1
=
bias1_slice
.
data
<
float
>
();
// int n = bias1_slice.dims()[0];
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
// int m = bias1_slice.dims()[1];
// for(int i=0;i<n*m;i++){
// if(biase_data1[i]!=0)
// DLOG<<biase_data1[i]<<",yangfei";
// }
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math
::
matmulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
biase_data1
);
p
,
mode
,
biase_data
,
biase_data1
);
}
}
}
}
...
@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
...
@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> ¶m) {
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
// FUSION_CONVADDADDPRELU_OP
src/operators/kernel/central-arm-func/conv_add_arm_func.h
浏览文件 @
c5f70926
...
@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
...
@@ -107,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
// gemm
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatM
ul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
false
,
biase_data
);
static_cast
<
float
>
(
1
),
false
,
biase_data
);
}
}
...
...
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
浏览文件 @
c5f70926
...
@@ -25,6 +25,7 @@ limitations under the License. */
...
@@ -25,6 +25,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
void
ConvAddBNReluBasic
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
void
ConvAddBNReluBasic
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
filter
=
*
param
.
Filter
();
...
@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> ¶m) {
...
@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
static_cast
<
float
>
(
1
),
&
out_slice
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
}
}
}
template
<
typename
P
>
template
<
typename
P
>
void
ConvAddBNReluCompute
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
void
ConvAddBNReluCompute
(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
Tensor
Bias
;
Tensor
Bias
;
...
@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> ¶m) {
...
@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> ¶m) {
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
param
.
Filter
()
->
dims
()[
3
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
param
.
Filter
()
->
dims
()[
3
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
3
&&
param
.
Strides
()[
0
]
==
2
)
{
param
.
Filter
()
->
dims
()[
2
]
==
3
&&
param
.
Strides
()[
0
]
==
2
)
{
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math
::
DepthwiseConvAddBNRelu3x3s2p1v2
(
param
.
Input
(),
param
.
Filter
(),
math
::
DepthwiseConvAddBNRelu3x3s2p1v2
(
param
.
Input
(),
param
.
Filter
(),
param
.
Output
(),
param
.
NewScale
(),
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
true
);
param
.
NewBias
(),
true
);
...
...
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
浏览文件 @
c5f70926
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#pragma once
#include <string>
#include <vector>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
...
@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
...
@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
Tensor
bias
=
*
param
.
Bias
();
// DLOG<<"yangfei";
// DLOG<<bias.dims();
int
axis
=
param
.
Axis
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
Tensor
*
output
=
param
.
Output
();
float
*
biase_data
=
bias
.
data
<
float
>
();
float
*
biase_data
=
bias
.
data
<
float
>
();
...
@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
...
@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
// gemm
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
// math::matmul<float>(filter_slice, false, col_matrix,
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math
::
matmulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
nullptr
);
p
,
mode
,
biase_data
,
nullptr
);
}
}
}
}
...
@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
...
@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> ¶m) {
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
// FUSION_CONVADDPRELU_OP
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
浏览文件 @
c5f70926
...
@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
...
@@ -112,7 +112,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
alpha
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
alpha
,
&
out_slice
,
beta
,
true
,
bias_data
);
&
out_slice
,
beta
,
true
,
bias_data
);
}
}
}
}
...
...
src/operators/kernel/central-arm-func/conv_arm_func.h
浏览文件 @
c5f70926
...
@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> ¶m) {
...
@@ -106,7 +106,7 @@ inline void GemmConv(const ConvParam<CPU> ¶m) {
// gemm
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
false
,
static_cast
<
float
>
(
0
),
false
,
static_cast
<
Otype
*>
(
nullptr
));
static_cast
<
Otype
*>
(
nullptr
));
...
...
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
浏览文件 @
c5f70926
...
@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> ¶m) {
...
@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias_data
=
bias_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias_data
=
bias_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
,
&
new_scale
,
static_cast
<
float
>
(
1
),
true
,
&
new_scale
,
&
new_bias
,
g
,
&
new_bias
,
g
,
bias_data
.
data
<
float
>
());
bias_data
.
data
<
float
>
());
}
}
}
}
}
}
...
...
src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
浏览文件 @
c5f70926
...
@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> ¶m) {
...
@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> ¶m) {
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
static_cast
<
float
>
(
1
),
&
out_slice
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
}
}
}
...
...
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
浏览文件 @
c5f70926
...
@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> ¶m) {
...
@@ -93,7 +93,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> ¶m) {
Tensor
filter_slice
=
filter
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
Tensor
out_slice
=
output_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
output_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matm
ul
<
P
,
P
>
(
filter_slice
,
true
,
in_slice
,
false
,
math
::
MatM
ul
<
P
,
P
>
(
filter_slice
,
true
,
in_slice
,
false
,
static_cast
<
P
>
(
1.0
),
&
col_matrix
,
static_cast
<
P
>
(
0.0
));
static_cast
<
P
>
(
1.0
),
&
col_matrix
,
static_cast
<
P
>
(
0.0
));
if
(
data_dim
==
2U
)
{
if
(
data_dim
==
2U
)
{
col2im
(
col
,
dilations
,
strides
,
col2im
(
col
,
dilations
,
strides
,
...
...
src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
浏览文件 @
c5f70926
...
@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> ¶m) {
...
@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> ¶m) {
// gemm
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmulWithBn
<
float
>
(
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
)
,
static_cast
<
float
>
(
1
),
&
out_slice
,
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
}
}
}
...
...
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
浏览文件 @
c5f70926
...
@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
...
@@ -57,7 +57,7 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
Otype
)
*
classes
);
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
Otype
)
*
classes
);
}
}
math
::
matm
ul
<
Itype
,
Otype
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
Itype
,
Otype
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
false
);
}
}
...
...
src/operators/kernel/central-arm-func/gru_arm_func.h
浏览文件 @
c5f70926
...
@@ -25,18 +25,16 @@ limitations under the License. */
...
@@ -25,18 +25,16 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
template
<
typename
Device
,
typename
T
>
using
Tensor
=
framework
::
Tensor
;
template
<
typename
DeviceType
,
typename
T
>
inline
void
ReorderInitState
(
const
framework
::
Tensor
&
src
,
inline
void
ReorderInitState
(
const
framework
::
Tensor
&
src
,
std
::
vector
<
size_t
>
index_lod
,
std
::
vector
<
size_t
>
index_lod
,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
Device
Type
,
T
>
row_shuffle
;
math
::
CopyMatrixRowsFunctor
<
Device
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
());
dst
->
mutable_data
<
T
>
(
src
.
dims
());
row_shuffle
(
src
,
index_lod
,
dst
,
indexed_src
);
row_shuffle
(
src
,
index_lod
,
dst
,
indexed_src
);
}
}
template
<
typename
P
>
template
<
typename
T
>
void
GruCompute
(
const
GruParam
<
CPU
>&
param
)
{
void
GruCompute
(
const
GruParam
<
CPU
>&
param
)
{
auto
*
input
=
param
.
InputInput
();
auto
*
input
=
param
.
InputInput
();
auto
*
h0
=
param
.
InputH0
();
auto
*
h0
=
param
.
InputH0
();
...
@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
...
@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
bool
is_reverse
=
param
.
IsReverse
();
bool
is_reverse
=
param
.
IsReverse
();
math
::
LoDTensor2BatchFunctor
<
CPU
,
float
>
to_batch
;
math
::
LoDTensor2BatchFunctor
<
CPU
,
float
>
to_batch
;
to_batch
(
*
input
,
batch_gate
,
true
,
is_reverse
);
to_batch
(
*
input
,
batch_gate
,
true
,
is_reverse
);
// math::ClearTensor<CPU, float> clearTensor;
// clearTensor(batch_gate);
if
(
bias
)
{
if
(
bias
)
{
math
::
RowwiseAdd
<
CPU
,
float
>
add_bias
;
math
::
RowwiseAdd
<
CPU
,
float
>
add_bias
;
add_bias
(
*
batch_gate
,
*
bias
,
batch_gate
);
add_bias
(
*
batch_gate
,
*
bias
,
batch_gate
);
...
@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
...
@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
gru_value
.
gate_weight
=
const_cast
<
float
*>
(
weight_data
);
gru_value
.
gate_weight
=
const_cast
<
float
*>
(
weight_data
);
gru_value
.
state_weight
=
gru_value
.
state_weight
=
const_cast
<
float
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
const_cast
<
float
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Tensor
ordered_h0
;
std
::
vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
std
::
vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// Since the batch computing for GRU reorders the input sequences
...
@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
...
@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
// BUG
framework
::
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
framework
::
Tensor
reset_hidden_prev_t
=
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
framework
::
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
float
>
();
gru_value
.
output_value
=
hidden_t
.
data
<
float
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
float
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
float
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
float
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
float
>
();
...
@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
...
@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
}
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
// GRU_OP
src/operators/kernel/central-arm-func/mul_arm_func.h
浏览文件 @
c5f70926
...
@@ -19,40 +19,6 @@ limitations under the License. */
...
@@ -19,40 +19,6 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
// 1、如果x,y维度都是2维,
// x = [[1,2], y = [[5,6],
// [3,4]] [7,8]]
// 运算结果为正常矩阵相乘。结果 out =
// [[1*5+2*7,1*6+2*8],[3*5+4*7, 3*6+4*8]]
//
// 2、如果x的维度大于2或者y的维度大于2,x的维度(2,3,4) ,y的维度(4,1,2)
// x = [[[1,2,3,4],
// [2,3,4,5],
// [3,4,5,6]],
// [[1,2,3,4],
// [2,3,4,5],
// [3,4,5,6]]]
// y = [[[1,2]],
// [[3,4]],
// [[5,6]],
// [[7,8]]]
// 需要借助x_num_col_dims和y_num_col_dims将x和y的维度转换为2维
// 从模型中读到参数,x_num_col_dims = 2,y_num_col_dims = 1,左开右闭
// (1) 将x = (2,3,4)的index [0,x_num_col_dims)部分2,3相乘,得到6,
// [x_num_col_dims,xdim.size())部分4相乘,得到4,
// 将Tensor x的dims重写成(6,4)
// (2) 将y = (4,1,2)的index [0,y_num_col_dims)部分4相乘,得到4,
// [y_num_col_dims,ydim.size())部分1,2相乘,得到2,
// 将Tensor y的dims重写成(4,2)
// 并不影响x,y在内存中的分布。
// x = [[1,2,3,4], y = [[1,2],
// [2,3,4,5], [3,4],
// [3,4,5,6], 矩阵乘法 [5,6],
// [1,2,3,4], [7,8]]
// [2,3,4,5],
// [3,4,5,6]]
// 结果x(6行4列)乘y(4行2列),按1中矩阵相乘,结果out(6行2列)
template
<
typename
P
>
template
<
typename
P
>
void
MulCompute
(
const
MulParam
<
CPU
>
&
param
)
{
void
MulCompute
(
const
MulParam
<
CPU
>
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_x
=
param
.
InputX
();
...
@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> ¶m) {
...
@@ -73,12 +39,12 @@ void MulCompute(const MulParam<CPU> ¶m) {
}
}
if
(
param
.
InputX
()
->
type
()
==
typeid
(
int8_t
))
{
if
(
param
.
InputX
()
->
type
()
==
typeid
(
int8_t
))
{
out
->
mutable_data
<
int32_t
>
();
out
->
mutable_data
<
int32_t
>
();
math
::
matm
ul
<
int8_t
,
int32_t
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
else
{
}
else
{
out
->
mutable_data
<
float
>
();
out
->
mutable_data
<
float
>
();
math
::
matm
ul
<
float
,
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
math
::
MatM
ul
<
float
,
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
}
...
...
src/operators/kernel/cl/fusion_fc_kernel.cpp
浏览文件 @
c5f70926
...
@@ -94,27 +94,19 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> ¶m, cl_context context,
...
@@ -94,27 +94,19 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> ¶m, cl_context context,
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
}
// for (int i = 0; i < out->numel(); i++) {
math
::
MatMul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
// DLOG << out_data[i];
// }
// bias_data的维度和out的维度一致
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
out
,
static_cast
<
float
>
(
1
),
false
);
out_image
->
InitEmptyImage
(
context
,
commandQueue
,
out
->
dims
());
out_image
->
InitEmptyImage
(
context
,
commandQueue
,
out
->
dims
());
framework
::
TensorToCLImage
(
out
,
out_image
,
context
,
commandQueue
,
kernel1
);
framework
::
TensorToCLImage
(
out
,
out_image
,
context
,
commandQueue
,
kernel1
);
DLOG
<<
*
out
;
delete
(
input_x
);
delete
(
input_x
);
delete
(
input_y
);
delete
(
input_y
);
delete
(
input_z
);
delete
(
input_z
);
delete
(
out
);
delete
(
out
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
}
template
<
>
template
<
>
void
FusionFcKernel
<
GPU_CL
,
float
>::
Compute
(
void
FusionFcKernel
<
GPU_CL
,
float
>::
Compute
(
const
FusionFcParam
<
GPU_CL
>
&
param
)
{
const
FusionFcParam
<
GPU_CL
>
&
param
)
{
...
...
src/operators/kernel/mali/fushion_fc_kernel.cpp
浏览文件 @
c5f70926
...
@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
...
@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
DLOG
<<
out_data
[
i
];
DLOG
<<
out_data
[
i
];
}
}
math
::
matm
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
math
::
MatM
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
));
out
,
static_cast
<
float
>
(
1
));
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// if (out_dim.size() != 2) {
...
...
src/operators/kernel/mali/mul_kernel.cpp
浏览文件 @
c5f70926
...
@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> ¶m) {
...
@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> ¶m) {
if
(
out_dim
.
size
()
!=
2
)
{
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
}
math
::
matm
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
math
::
MatM
ul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
out
,
static_cast
<
float
>
(
0
));
if
(
out_dim
.
size
()
!=
2
)
{
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
(
out_dim
);
out
->
Resize
(
out_dim
);
...
...
src/operators/math/math_func_neon.h
浏览文件 @
c5f70926
...
@@ -38,7 +38,11 @@ limitations under the License. */
...
@@ -38,7 +38,11 @@ limitations under the License. */
*
*
* (this is the zlib license)
* (this is the zlib license)
*/
*/
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once
#pragma once
#include <arm_neon.h>
#include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u
#define c_inv_mant_mask ~0x7f800000u
...
@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
...
@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
static
inline
float32x4_t
div_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
static
inline
float32x4_t
div_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
float32x4_t
reciprocal
=
vrecpeq_f32
(
b
);
float32x4_t
reciprocal
=
vrecpeq_f32
(
b
);
reciprocal
=
vmulq_f32
(
vrecpsq_f32
(
b
,
reciprocal
),
reciprocal
);
reciprocal
=
vmulq_f32
(
vrecpsq_f32
(
b
,
reciprocal
),
reciprocal
);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return
vmulq_f32
(
a
,
reciprocal
);
return
vmulq_f32
(
a
,
reciprocal
);
}
}
static
inline
float32x4_t
pow_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
static
inline
float32x4_t
pow_ps
(
float32x4_t
a
,
float32x4_t
b
)
{
// pow(x, m) = exp(m * log(x))
return
exp_ps
(
vmulq_f32
(
b
,
log_ps
(
a
)));
return
exp_ps
(
vmulq_f32
(
b
,
log_ps
(
a
)));
}
}
#endif // __ARM_NEON__
src/operators/math/math_function.cpp
浏览文件 @
c5f70926
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include <cstring>
#include <string>
#include <string>
#include "common/enforce.h"
#include "framework/data_type.h"
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
#include "operators/math/gemm.h"
...
@@ -35,13 +35,13 @@ struct TensorSetConstant {
...
@@ -35,13 +35,13 @@ struct TensorSetConstant {
float
value_
;
float
value_
;
};
};
void
set_c
onstant
(
framework
::
Tensor
*
tensor
,
float
value
)
{
void
SetC
onstant
(
framework
::
Tensor
*
tensor
,
float
value
)
{
framework
::
VisitDataType
(
framework
::
ToDataType
(
tensor
->
type
()),
framework
::
VisitDataType
(
framework
::
ToDataType
(
tensor
->
type
()),
TensorSetConstant
(
tensor
,
value
));
TensorSetConstant
(
tensor
,
value
));
}
}
template
<
>
template
<
>
void
matm
ul
<
float
,
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
float
,
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
float
*
bias
)
{
float
beta
,
bool
relu
,
float
*
bias
)
{
...
@@ -50,7 +50,7 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -50,7 +50,7 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
N
=
dim_out
[
1
];
...
@@ -72,7 +72,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -72,7 +72,6 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
}
}
#ifdef _OPENMP
#ifdef _OPENMP
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
#else
...
@@ -92,19 +91,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -92,19 +91,18 @@ void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
}
}
}
}
template
<
>
void
MatMulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
matmulWithBn
<
float
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
int
group
,
float
*
bias
)
{
framework
::
Tensor
*
new_bias
,
int
group
,
float
*
bias
)
{
Gemm
gemm
;
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
N
=
dim_out
[
1
];
...
@@ -122,7 +120,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -122,7 +120,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#endif
#endif
}
}
void
matm
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
float
*
bias
,
float
*
bias1
)
{
...
@@ -132,7 +130,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -132,7 +130,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int
M
=
dim_out
[
0
];
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
N
=
dim_out
[
1
];
...
@@ -146,7 +144,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -146,7 +144,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
gemm
.
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
gemm
.
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
p
,
mode
,
bias
,
bias1
);
#endif
#endif
}
}
...
...
src/operators/math/math_function.h
浏览文件 @
c5f70926
...
@@ -14,7 +14,6 @@ limitations under the License. */
...
@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#pragma once
#include <cmath>
#include <string>
#include <string>
#include "framework/tensor.h"
#include "framework/tensor.h"
...
@@ -22,37 +21,37 @@ namespace paddle_mobile {
...
@@ -22,37 +21,37 @@ namespace paddle_mobile {
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
void
set_c
onstant
(
framework
::
Tensor
*
tensor
,
float
value
);
void
SetC
onstant
(
framework
::
Tensor
*
tensor
,
float
value
);
template
<
typename
Itype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Otype
>
void
matm
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
=
false
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
=
false
,
Otype
*
bias
=
nullptr
);
Otype
*
bias
=
nullptr
);
template
<
typename
Itype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Otype
>
void
matm
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
Otype
*
bias
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
Otype
*
bias
,
bool
addOnRow
);
bool
addOnRow
);
template
<
typename
T
>
void
MatMulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
T
*
bias
=
nullptr
);
int
group
,
float
*
bias
=
nullptr
);
void
matm
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
float
*
bias
,
float
*
bias1
);
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
Device
,
typename
T
>
struct
ClearTensor
{
struct
ClearTensor
{
void
operator
()(
framework
::
Tensor
*
tensor
);
void
operator
()(
framework
::
Tensor
*
tensor
);
};
};
template
<
typename
Device
Type
,
typename
T
>
template
<
typename
Device
,
typename
T
>
struct
RowwiseAdd
{
struct
RowwiseAdd
{
void
operator
()(
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vec
,
void
operator
()(
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
output
);
framework
::
Tensor
*
output
);
...
...
src/operators/math/math_function_int8.cpp
浏览文件 @
c5f70926
...
@@ -22,7 +22,7 @@ namespace operators {
...
@@ -22,7 +22,7 @@ namespace operators {
namespace
math
{
namespace
math
{
template
<
>
template
<
>
void
matm
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
,
float
beta
,
bool
relu
,
int32_t
*
bias
,
...
@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -32,7 +32,7 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
auto
dim_out
=
matrix_out
->
dims
();
auto
dim_out
=
matrix_out
->
dims
();
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
dim_a
.
size
()
==
2
&&
dim_b
.
size
()
==
2
&&
dim_out
.
size
()
==
2
,
"The input and output of
matm
ul be matrix"
);
"The input and output of
MatM
ul be matrix"
);
int32_t
M
=
dim_out
[
0
];
int32_t
M
=
dim_out
[
0
];
int32_t
N
=
dim_out
[
1
];
int32_t
N
=
dim_out
[
1
];
...
@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -96,11 +96,11 @@ void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
}
}
template
<
>
template
<
>
void
matm
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
MatM
ul
<
int8_t
,
int32_t
>
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
)
{
float
beta
,
bool
relu
,
int32_t
*
bias
)
{
matm
ul
<
int8_t
,
int32_t
>
(
matrix_a
,
trans_a
,
matrix_b
,
trans_b
,
alpha
,
MatM
ul
<
int8_t
,
int32_t
>
(
matrix_a
,
trans_a
,
matrix_b
,
trans_b
,
alpha
,
matrix_out
,
beta
,
relu
,
bias
,
false
);
matrix_out
,
beta
,
relu
,
bias
,
false
);
}
}
...
...
src/operators/math/softmax.cpp
浏览文件 @
c5f70926
...
@@ -15,154 +15,131 @@ limitations under the License. */
...
@@ -15,154 +15,131 @@ limitations under the License. */
#ifdef SOFTMAX_OP
#ifdef SOFTMAX_OP
#include "operators/math/softmax.h"
#include "operators/math/softmax.h"
#include "common/types.h"
#ifdef __ARM_NEON
#include <math.h>
#include <math.h>
#include <algorithm>
#include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h"
#include "operators/math/math_func_neon.h"
#endif
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
using
framework
::
DDim
;
using
framework
::
Tensor
;
template
<
typename
T
>
class
SoftmaxFuntor
<
CPU
,
T
>
{
#ifdef __ARM_NEON
void
sum
(
float
*
input
,
float
*
sumptr
,
int
inner_size
,
int
outter_size
)
{
float32x4_t
acc
=
vdupq_n_f32
(
0
);
float
sum_
=
0
;
for
(
int
i
=
0
;
i
<
outter_size
;
++
i
)
{
float
*
input_outer_ptr
=
input
+
i
*
inner_size
;
int
nn
=
inner_size
>>
2
;
int
left
=
inner_size
-
(
nn
<<
2
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
acc
=
vaddq_f32
(
acc
,
vec_input
);
input_outer_ptr
+=
4
;
}
float32x2_t
vsum_
=
vadd_f32
(
vget_high_f32
(
acc
),
vget_low_f32
(
acc
));
sum_
=
vget_lane_f32
(
vsum_
,
0
)
+
vget_lane_f32
(
vsum_
,
1
);
for
(;
left
>
0
;
left
--
)
{
sum_
+=
*
input_outer_ptr
;
input_outer_ptr
++
;
}
}
for
(
int
j
=
0
;
j
<
inner_size
*
outter_size
;
++
j
)
{
sumptr
[
j
]
=
sum_
;
}
}
void
SoftmaxCacl
(
const
Tensor
*
X
,
Tensor
*
Y
)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
const
float
*
input
=
X
->
data
<
float
>
();
#ifndef __aarch64__
const
DDim
&
dDim
=
X
->
dims
();
inline
float32_t
vmaxvq_f32
(
const
float32x4_t
&
r
)
{
int
axis_index
=
1
;
float32x2_t
v
=
vmax_f32
(
vget_high_f32
(
r
),
vget_low_f32
(
r
));
if
(
dDim
.
size
()
<
4
)
{
return
vget_lane_f32
(
vpmax_f32
(
v
,
v
),
0
);
axis_index
=
0
;
}
}
DDim
outer_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
0
,
axis_index
+
1
);
DDim
inner_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
axis_index
+
1
,
dDim
.
size
());
int
out_size
=
paddle_mobile
::
framework
::
product
(
outer_ddim
);
int
inner_size
=
paddle_mobile
::
framework
::
product
(
inner_ddim
);
auto
*
max_ptr
=
new
float
[
inner_size
*
out_size
];
// max
for
(
int
j
=
0
;
j
<
out_size
;
++
j
)
{
const
float
*
input_outer_ptr
=
input
+
j
*
inner_size
;
float
*
max_outer_ptr
=
max_ptr
+
j
*
inner_size
;
float
max_
=
0
;
for
(
int
i
=
0
;
i
<
inner_size
;
++
i
)
{
const
float
*
input_inner_ptr
=
input_outer_ptr
+
i
;
max_
=
std
::
max
(
max_
,
input_inner_ptr
[
0
]);
}
for
(
int
k
=
0
;
k
<
inner_size
;
++
k
)
{
max_outer_ptr
[
k
]
=
max_
;
}
}
// exp(value - max)
float
*
exp_sub_max
=
new
float
[
inner_size
*
out_size
];
float
*
exp_sub_max_ptr
=
&
exp_sub_max
[
0
];
for
(
int
l
=
0
;
l
<
out_size
;
++
l
)
{
const
float
*
input_outer_ptr
=
input
+
l
*
inner_size
;
float
*
max_outer_ptr
=
max_ptr
+
l
*
inner_size
;
int
nn
=
inner_size
>>
2
;
int
left
=
inner_size
-
(
nn
<<
2
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
float32x4_t
vec_max
=
vld1q_f32
(
max_outer_ptr
);
float32x4_t
vec_sub
=
vsubq_f32
(
vec_input
,
vec_max
);
float32x4_t
vec_exp
=
exp_ps
(
vec_sub
);
vst1q_f32
(
exp_sub_max_ptr
,
vec_exp
);
input_outer_ptr
+=
4
;
max_outer_ptr
+=
4
;
exp_sub_max_ptr
+=
4
;
}
for
(;
left
>
0
;
left
--
)
{
*
exp_sub_max_ptr
=
expf
(
*
input_outer_ptr
-
*
max_outer_ptr
);
input_outer_ptr
++
;
inline
float32_t
vaddvq_f32
(
const
float32x4_t
&
r
)
{
max_outer_ptr
++
;
float32x2_t
v
=
vadd_f32
(
vget_high_f32
(
r
),
vget_low_f32
(
r
))
;
exp_sub_max_ptr
++
;
return
vget_lane_f32
(
vpadd_f32
(
v
,
v
),
0
)
;
}
}
}
#endif // __aarch64__
float
*
sumptr
=
new
float
[
inner_size
*
out_size
];
#endif // __ARM_NEON__
// sum exp
sum
(
exp_sub_max
,
sumptr
,
inner_size
,
out_size
);
float
find_max
(
const
float
*
input
,
const
int
num_classes
)
{
// div
int
remain
=
num_classes
;
auto
*
out_ptr
=
Y
->
mutable_data
<
float
>
();
float
max
=
-
std
::
numeric_limits
<
float
>::
max
();
for
(
int
l
=
0
;
l
<
out_size
;
++
l
)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
const
float
*
input_outer_ptr
=
exp_sub_max
+
l
*
inner_size
;
int
loop
=
num_classes
>>
3
;
float
*
output_outer_ptr
=
out_ptr
+
l
*
inner_size
;
remain
=
num_classes
&
0x7
;
float
*
sum_outer_ptr
=
sumptr
+
l
*
inner_size
;
float32x4_t
__max
=
vdupq_n_f32
(
max
)
;
int
nn
=
inner_size
>>
2
;
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
input
+=
8
)
{
int
left
=
inner_size
-
(
nn
<<
2
);
float32x4_t
x0
=
vld1q_f32
(
input
);
for
(;
nn
>
0
;
nn
--
)
{
float32x4_t
x1
=
vld1q_f32
(
input
+
4
);
float32x4_t
vec_input
=
vld1q_f32
(
input_outer_ptr
);
__max
=
vmaxq_f32
(
x0
,
__max
);
float32x4_t
vec_sum
=
vld1q_f32
(
sum_outer_ptr
);
__max
=
vmaxq_f32
(
x1
,
__max
);
float32x4_t
vec_div
=
div_ps
(
vec_input
,
vec_sum
);
}
vst1q_f32
(
output_outer_ptr
,
vec_div
);
max
=
vmaxvq_f32
(
__max
);
input_outer_ptr
+=
4
;
#endif
output_outer_ptr
+=
4
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
sum_outer_ptr
+=
4
;
max
=
std
::
max
(
max
,
input
[
i
])
;
}
}
for
(;
left
>
0
;
left
--
)
{
return
max
;
*
output_outer_ptr
=
(
*
input_outer_ptr
)
/
(
*
sum_outer_ptr
);
}
input_outer_ptr
++
;
output_outer_ptr
++
;
template
<
>
sum_outer_ptr
++
;
void
SoftmaxFuntor
<
CPU
,
float
>::
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
const
framework
::
DDim
&
dims
=
X
->
dims
();
int
batch_size
=
dims
[
0
];
int
num_classes
=
dims
[
dims
.
size
()
-
1
];
int
channels
=
X
->
numel
()
/
batch_size
/
num_classes
;
const
float
*
x
=
X
->
data
<
float
>
();
float
*
y
=
Y
->
mutable_data
<
float
>
();
#pragma omp parallel for collapse(2)
for
(
int
batch
=
0
;
batch
<
X
->
dims
()[
0
];
++
batch
)
{
for
(
int
channel
=
0
;
channel
<
channels
;
++
channel
)
{
size_t
offset
=
(
batch
*
channels
+
channel
)
*
num_classes
;
const
float
*
input
=
x
+
offset
;
float
*
output
=
y
+
offset
;
// find max
float
max
=
find_max
(
input
,
num_classes
);
// exp(x - max)
int
remain
=
num_classes
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int
loop
=
num_classes
>>
3
;
remain
=
num_classes
&
0x7
;
float32x4_t
__max
=
vdupq_n_f32
(
max
);
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
input
+=
8
,
output
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
input
);
float32x4_t
x1
=
vld1q_f32
(
input
+
4
);
x0
=
vsubq_f32
(
x0
,
__max
);
x1
=
vsubq_f32
(
x1
,
__max
);
x0
=
exp_ps
(
x0
);
x1
=
exp_ps
(
x1
);
vst1q_f32
(
output
,
x0
);
vst1q_f32
(
output
+
4
,
x1
);
}
#endif // __ARM_NEON__
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
output
[
i
]
=
std
::
expf
(
input
[
i
]
-
max
);
}
}
// sum(exp(x - max))
float
sum
=
0.
f
;
output
=
y
+
offset
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t
__sum
=
vdupq_n_f32
(
0.
f
);
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
output
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
output
);
float32x4_t
x1
=
vld1q_f32
(
output
+
4
);
__sum
=
vaddq_f32
(
x0
,
__sum
);
__sum
=
vaddq_f32
(
x1
,
__sum
);
}
sum
+=
vaddvq_f32
(
__sum
);
#endif // __ARM_NEON__
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
sum
+=
output
[
i
];
}
}
// exp(x - max) / sum
float
inv_sum
=
1.
f
/
sum
;
output
=
y
+
offset
;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t
__inv_sum
=
vdupq_n_f32
(
inv_sum
);
for
(
int
i
=
0
;
i
<
loop
;
++
i
,
output
+=
8
)
{
float32x4_t
x0
=
vld1q_f32
(
output
);
float32x4_t
x1
=
vld1q_f32
(
output
+
4
);
x0
=
vmulq_f32
(
x0
,
__inv_sum
);
x1
=
vmulq_f32
(
x1
,
__inv_sum
);
vst1q_f32
(
output
,
x0
);
vst1q_f32
(
output
+
4
,
x0
);
}
}
#else
#endif // ARM_NEON
public:
void
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
const
DDim
dDim
=
X
->
dims
();
int
dim1
=
dDim
[
dDim
.
size
()
-
1
];
int
dim0
=
X
->
numel
()
/
dim1
/
dDim
[
0
];
framework
::
DDim
matrix_shape
=
{
dim0
,
dim1
};
for
(
int
i
=
0
;
i
<
dDim
[
0
];
++
i
)
{
framework
::
Tensor
sub_X
=
X
->
Slice
(
i
,
i
+
1
);
framework
::
Tensor
sub_Y
=
Y
->
Slice
(
i
,
i
+
1
);
sub_X
.
Resize
(
matrix_shape
);
sub_Y
.
Resize
(
matrix_shape
);
for
(
int
j
=
0
;
j
<
dim0
;
j
++
)
{
framework
::
Tensor
sub_x
=
sub_X
.
Slice
(
j
,
j
+
1
);
framework
::
Tensor
sub_y
=
sub_Y
.
Slice
(
j
,
j
+
1
);
#ifdef __ARM_NEON
SoftmaxCacl
(
&
sub_x
,
&
sub_y
);
#endif
#endif
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
output
[
i
]
*=
inv_sum
;
}
}
}
}
}
}
};
}
template
class
SoftmaxFuntor
<
CPU
,
float
>;
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif // SOFTMAX_OP
src/operators/math/softmax.h
浏览文件 @
c5f70926
...
@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
...
@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef SOFTMAX_OP
#ifdef SOFTMAX_OP
#pragma once
#pragma once
#include "framework/tensor.h"
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
template
<
typename
Device
Type
,
typename
T
>
template
<
typename
Device
,
typename
T
>
class
SoftmaxFuntor
{
class
SoftmaxFuntor
{
public:
public:
void
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
);
void
operator
()(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
);
};
};
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
test/CMakeLists.txt
浏览文件 @
c5f70926
...
@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
...
@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE
(
test-inference-api framework/test_inference_api.cpp
)
ADD_EXECUTABLE
(
test-inference-api framework/test_inference_api.cpp
)
target_link_libraries
(
test-inference-api paddle-mobile
)
target_link_libraries
(
test-inference-api paddle-mobile
)
# gen test log
# gen test
# gen test
ADD_EXECUTABLE
(
test-optimize framework/test_optimize.cpp
)
ADD_EXECUTABLE
(
test-optimize framework/test_optimize.cpp
)
target_link_libraries
(
test-optimize paddle-mobile
)
target_link_libraries
(
test-optimize paddle-mobile
)
#gen test
#gen test
ADD_EXECUTABLE
(
test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h
)
ADD_EXECUTABLE
(
test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-pool-op paddle-mobile
)
target_link_libraries
(
test-pool-op paddle-mobile
)
#gen test
#gen test
ADD_EXECUTABLE
(
test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h
)
ADD_EXECUTABLE
(
test-softmax
-op
operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-softmax paddle-mobile
)
target_link_libraries
(
test-softmax
-op
paddle-mobile
)
# gen test
# gen test
ADD_EXECUTABLE
(
test-gemm-accuracy common/test_gemm_accuracy.cpp
)
ADD_EXECUTABLE
(
test-gemm-accuracy common/test_gemm_accuracy.cpp
)
...
...
test/common/test_gemm_perf.cpp
浏览文件 @
c5f70926
...
@@ -73,14 +73,14 @@ int main() {
...
@@ -73,14 +73,14 @@ int main() {
// float
// float
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
float
,
float
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
float
,
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
false
,
nullptr
);
}
}
auto
time_start0
=
time
();
auto
time_start0
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
float
,
float
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
float
,
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
false
,
nullptr
);
}
}
...
@@ -91,14 +91,14 @@ int main() {
...
@@ -91,14 +91,14 @@ int main() {
// int8_t without bias
// int8_t without bias
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
}
auto
time_start1
=
time
();
auto
time_start1
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
}
...
@@ -109,13 +109,13 @@ int main() {
...
@@ -109,13 +109,13 @@ int main() {
// int8_t with bias, column element wise add
// int8_t with bias, column element wise add
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
}
auto
time_start2
=
time
();
auto
time_start2
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
}
...
@@ -126,13 +126,13 @@ int main() {
...
@@ -126,13 +126,13 @@ int main() {
// int8_t with bias, row element wise add
// int8_t with bias, row element wise add
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
}
auto
time_start3
=
time
();
auto
time_start3
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
}
...
@@ -143,13 +143,13 @@ int main() {
...
@@ -143,13 +143,13 @@ int main() {
// int8_t with bias&relu
// int8_t with bias&relu
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
}
}
auto
time_start4
=
time
();
auto
time_start4
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matm
ul
<
int8_t
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
MatM
ul
<
int8_t
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
static_cast
<
float
>
(
0
),
true
,
bias_data_col
,
false
);
}
}
...
...
test/operators/test_softmax_op.cpp
浏览文件 @
c5f70926
...
@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <math.h>
#include <limits>
#include "../test_include.h"
#include "../test_include.h"
#include "operators/softmax_op.h"
#include "operators/softmax_op.h"
int
main
()
{
namespace
paddle_mobile
{
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
auto
program
=
loader
.
Load
(
std
::
string
(
g_mobilenet
));
void
Softmax
(
const
framework
::
Tensor
*
X
,
framework
::
Tensor
*
Y
)
{
if
(
program
.
originProgram
==
nullptr
)
{
const
framework
::
DDim
&
dims
=
X
->
dims
();
DLOG
<<
"program read file"
;
int
batch_size
=
dims
[
0
];
int
num_classes
=
dims
[
dims
.
size
()
-
1
];
int
channels
=
X
->
numel
()
/
batch_size
/
num_classes
;
const
float
*
x
=
X
->
data
<
float
>
();
float
*
y
=
Y
->
mutable_data
<
float
>
();
for
(
int
batch
=
0
;
batch
<
batch_size
;
++
batch
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
size_t
offset
=
(
batch
*
channels
+
c
)
*
num_classes
;
const
float
*
input
=
x
+
offset
;
float
*
output
=
y
+
offset
;
float
max
=
-
std
::
numeric_limits
<
float
>::
max
();
for
(
int
j
=
0
;
j
<
num_classes
;
++
j
)
{
max
=
(
input
[
j
]
>
max
)
?
input
[
j
]
:
max
;
}
float
sum
=
0.
f
;
for
(
int
j
=
0
;
j
<
num_classes
;
++
j
)
{
float
tmp
=
std
::
expf
(
input
[
j
]
-
max
);
sum
+=
tmp
;
output
[
j
]
=
tmp
;
}
for
(
int
j
=
0
;
j
<
num_classes
;
++
j
)
{
output
[
j
]
/=
sum
;
}
}
}
}
int
TestSoftmaxOp
(
const
std
::
vector
<
int
>
input_shape
)
{
framework
::
DDim
dims
=
framework
::
make_ddim
(
input_shape
);
VariableNameMap
inputs
;
VariableNameMap
outputs
;
auto
scope
=
std
::
make_shared
<
framework
::
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"input"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
auto
input_var
=
scope
.
get
()
->
Var
(
"input"
);
auto
input
=
input_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
float
>
(
input
,
dims
,
-
100.0
,
100.0
);
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
framework
::
AttributeMap
attrs
;
auto
*
op
=
new
operators
::
SoftmaxOp
<
CPU
,
float
>
(
"softmax"
,
inputs
,
outputs
,
attrs
,
scope
);
op
->
InferShape
();
op
->
Init
();
op
->
Run
();
framework
::
Tensor
output_cmp
;
float
*
output_cmp_data
=
output_cmp
.
mutable_data
<
float
>
(
output
->
dims
());
Softmax
(
input
,
&
output_cmp
);
const
float
*
output_data
=
output
->
data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
->
numel
();
++
i
)
{
float
gap
=
output_data
[
i
]
-
output_cmp_data
[
i
];
if
(
std
::
abs
(
gap
/
(
output_data
[
i
]
+
1e-5
))
>
1e-3
)
{
LOG
(
kLOG_INFO
)
<<
"output_data["
<<
i
<<
"] = "
<<
output_data
[
i
]
<<
", output_cmp_data["
<<
i
<<
"] = "
<<
output_cmp_data
[
i
];
delete
op
;
exit
(
1
);
}
}
Executor4Test
<
paddle_mobile
::
CPU
,
paddle_mobile
::
operators
::
SoftmaxOp
<
paddle_mobile
::
CPU
,
float
>>
executor
(
program
,
"softmax"
);
paddle_mobile
::
framework
::
Tensor
input
;
SetupTensor
<
float
>
(
&
input
,
{
1
,
1000
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
auto
out_ddim
=
paddle_mobile
::
framework
::
make_ddim
({
1
,
1000
});
auto
output
=
executor
.
Predict
(
input
,
"reshape_0.tmp_0"
,
"softmax_0.tmp_0"
,
out_ddim
);
auto
*
output_ptr
=
output
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
output
->
numel
();
++
j
)
{
DLOG
<<
" value of output: "
<<
output_ptr
[
j
];
}
}
delete
op
;
return
0
;
}
}
// namespace paddle_mobile
int
main
(
int
argc
,
char
*
argv
[])
{
TestSoftmaxOp
({
128
,
1000
});
TestSoftmaxOp
({
128
,
10
,
1000
});
return
0
;
return
0
;
}
}
tools/pre-commit.hooks/cpplint.hook
浏览文件 @
c5f70926
...
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
...
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for
file
in
$(
git diff
--cached
--name-status
|
awk
'$1 != "D" {print $2}'
|
\
for
file
in
$(
git diff
--cached
--name-status
|
awk
'$1 != "D" {print $2}'
|
\
grep
-v
".pb.cpp"
|
grep
-v
".pb.h"
|
grep
-v
".pb-c.h"
|
grep
-v
".pb-c.c"
|
\
grep
-v
".pb.cpp"
|
grep
-v
".pb.h"
|
grep
-v
".pb-c.h"
|
grep
-v
".pb-c.c"
|
\
grep
-v
"protobuf-c.h"
|
grep
-v
"protobuf-c.c"
|
grep
-v
"paddle_mobile_jni.cpp"
)
;
do
grep
-v
"protobuf-c.h"
|
grep
-v
"protobuf-c.c"
)
;
do
cpplint
$file
;
cpplint
$file
;
TOTAL_ERRORS
=
$(
expr
$TOTAL_ERRORS
+
$?
)
;
TOTAL_ERRORS
=
$(
expr
$TOTAL_ERRORS
+
$?
)
;
done
done
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录