Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
dd0cf789
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
dd0cf789
编写于
6月 28, 2018
作者:
R
Ruilong Liu
提交者:
GitHub
6月 28, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
97102280
ccd7c7de
变更
51
展开全部
显示空白变更内容
内联
并排
Showing
51 changed file
with
4246 addition
and
533 deletion
+4246
-533
.gitmodules
.gitmodules
+3
-0
CMakeLists.txt
CMakeLists.txt
+13
-2
src/framework/operator.h
src/framework/operator.h
+12
-0
src/operators/batchnorm_op.cpp
src/operators/batchnorm_op.cpp
+2
-0
src/operators/concat_op.cpp
src/operators/concat_op.cpp
+2
-0
src/operators/elementwise_add_op.cpp
src/operators/elementwise_add_op.cpp
+2
-0
src/operators/feed_op.h
src/operators/feed_op.h
+2
-0
src/operators/fetch_op.h
src/operators/fetch_op.h
+2
-0
src/operators/fusion_conv_add.cpp
src/operators/fusion_conv_add.cpp
+2
-0
src/operators/fusion_conv_add.h
src/operators/fusion_conv_add.h
+12
-0
src/operators/fusion_conv_add_relu_op.h
src/operators/fusion_conv_add_relu_op.h
+5
-0
src/operators/fusion_fc_op.cpp
src/operators/fusion_fc_op.cpp
+2
-0
src/operators/fusion_fc_op.h
src/operators/fusion_fc_op.h
+9
-1
src/operators/kernel/arm/batchnorm_kernel.cpp
src/operators/kernel/arm/batchnorm_kernel.cpp
+2
-210
src/operators/kernel/arm/conv_add_relu_kernel.cpp
src/operators/kernel/arm/conv_add_relu_kernel.cpp
+2
-86
src/operators/kernel/arm/conv_kernel.cpp
src/operators/kernel/arm/conv_kernel.cpp
+2
-82
src/operators/kernel/arm/depthwise_conv_kernel.cpp
src/operators/kernel/arm/depthwise_conv_kernel.cpp
+2
-86
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
+234
-0
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+117
-0
src/operators/kernel/central-arm-func/conv_arm_func.h
src/operators/kernel/central-arm-func/conv_arm_func.h
+115
-0
src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
...erators/kernel/central-arm-func/depthwise_conv_arm_func.h
+116
-0
src/operators/kernel/conv_kernel.h
src/operators/kernel/conv_kernel.h
+0
-15
src/operators/kernel/mali/ACL_Android
src/operators/kernel/mali/ACL_Android
+1
-0
src/operators/kernel/mali/acl_operator.cc
src/operators/kernel/mali/acl_operator.cc
+220
-0
src/operators/kernel/mali/acl_operator.h
src/operators/kernel/mali/acl_operator.h
+1144
-0
src/operators/kernel/mali/acl_tensor.cc
src/operators/kernel/mali/acl_tensor.cc
+160
-0
src/operators/kernel/mali/acl_tensor.h
src/operators/kernel/mali/acl_tensor.h
+128
-0
src/operators/kernel/mali/batchnorm_kernel.cpp
src/operators/kernel/mali/batchnorm_kernel.cpp
+134
-2
src/operators/kernel/mali/concat_kernel.cpp
src/operators/kernel/mali/concat_kernel.cpp
+140
-0
src/operators/kernel/mali/conv_add_kernel.cpp
src/operators/kernel/mali/conv_add_kernel.cpp
+235
-0
src/operators/kernel/mali/conv_kernel.cpp
src/operators/kernel/mali/conv_kernel.cpp
+199
-5
src/operators/kernel/mali/elementwise_add_kernel.cpp
src/operators/kernel/mali/elementwise_add_kernel.cpp
+52
-0
src/operators/kernel/mali/fushion_fc_kernel.cpp
src/operators/kernel/mali/fushion_fc_kernel.cpp
+77
-0
src/operators/kernel/mali/lrn_kernel.cpp
src/operators/kernel/mali/lrn_kernel.cpp
+148
-0
src/operators/kernel/mali/mul_kernel.cpp
src/operators/kernel/mali/mul_kernel.cpp
+59
-0
src/operators/kernel/mali/pool_kernel.cpp
src/operators/kernel/mali/pool_kernel.cpp
+220
-0
src/operators/kernel/mali/relu_kernel.cpp
src/operators/kernel/mali/relu_kernel.cpp
+136
-0
src/operators/kernel/mali/reshape_kernel.cpp
src/operators/kernel/mali/reshape_kernel.cpp
+60
-0
src/operators/kernel/mali/softmax_kernel.cpp
src/operators/kernel/mali/softmax_kernel.cpp
+137
-0
src/operators/lrn_op.cpp
src/operators/lrn_op.cpp
+2
-0
src/operators/math/depthwiseconv3x3s1p1.cpp
src/operators/math/depthwiseconv3x3s1p1.cpp
+288
-0
src/operators/math/depthwiseconv3x3s1p1.h
src/operators/math/depthwiseconv3x3s1p1.h
+27
-0
src/operators/mul_op.cpp
src/operators/mul_op.cpp
+2
-0
src/operators/pool_op.cpp
src/operators/pool_op.cpp
+2
-0
src/operators/relu_op.cpp
src/operators/relu_op.cpp
+2
-0
src/operators/reshape_op.cpp
src/operators/reshape_op.cpp
+2
-0
src/operators/softmax_op.cpp
src/operators/softmax_op.cpp
+2
-0
tools/android-debug-script/push2android.sh
tools/android-debug-script/push2android.sh
+10
-5
tools/android-debug-script/run_on_android.sh
tools/android-debug-script/run_on_android.sh
+0
-0
tools/build.sh
tools/build.sh
+1
-1
tools/run.sh
tools/run.sh
+0
-38
未找到文件。
.gitmodules
0 → 100644
浏览文件 @
dd0cf789
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
CMakeLists.txt
浏览文件 @
dd0cf789
...
...
@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON)
option
(
LOG_PROFILE
"log profile"
ON
)
# select the platform to build
option
(
CPU
"armv7 with neon"
ON
)
option
(
MALI_GPU
"mali gpu"
O
FF
)
option
(
MALI_GPU
"mali gpu"
O
N
)
option
(
FPGA
"fpga"
OFF
)
set
(
DEBUGING ON
)
if
(
CPU
)
...
...
@@ -16,6 +16,17 @@ endif()
if
(
MALI_GPU
)
add_definitions
(
-DPADDLE_MOBILE_MALI_GPU
)
add_definitions
(
-DUSE_ACL=1
)
add_definitions
(
-DUSE_OPENCL
)
set
(
ACL_ROOT
${
CMAKE_CURRENT_SOURCE_DIR
}
/src/operators/kernel/mali/ACL_Android
)
include_directories
(
${
ACL_ROOT
}
${
ACL_ROOT
}
/include
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_core"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_graph"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build/opencl-1.2-stubs"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-lOpenCL"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-DUSE_ACL=1"
)
endif
()
if
(
FPGA
)
...
...
src/framework/operator.h
浏览文件 @
dd0cf789
...
...
@@ -138,9 +138,21 @@ class OpKernelBase {
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* */
#ifdef PADDLE_MOBILE_MALI_GPU
OpKernelBase
()
{
acl_op_
=
nullptr
;
}
void
*
GetAclOp
()
const
{
return
acl_op_
;
}
void
SetAclOp
(
void
*
op
,
void
*
ob
)
const
{
reinterpret_cast
<
OpKernelBase
<
Dtype
,
P
>
*>
(
ob
)
->
acl_op_
=
op
;
}
#endif
virtual
void
Compute
(
const
P
&
para
)
const
=
0
;
virtual
bool
Init
(
const
P
&
para
)
const
{
return
true
;
};
virtual
~
OpKernelBase
()
=
default
;
private:
#ifdef PADDLE_MOBILE_MALI_GPU
void
*
acl_op_
;
#endif
};
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
...
...
src/operators/batchnorm_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
batch_norm
);
REGISTER_OPERATOR_MALI_GPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/concat_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU
(
concat
,
ops
::
ConcatOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
concat
);
REGISTER_OPERATOR_MALI_GPU
(
concat
,
ops
::
ConcatOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/elementwise_add_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
elementwise_add
);
REGISTER_OPERATOR_MALI_GPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/feed_op.h
浏览文件 @
dd0cf789
...
...
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU
(
feed
,
ops
::
FeedOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
feed
);
REGISTER_OPERATOR_MALI_GPU
(
feed
,
ops
::
FeedOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fetch_op.h
浏览文件 @
dd0cf789
...
...
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU
(
fetch
,
ops
::
FetchOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fetch
);
REGISTER_OPERATOR_MALI_GPU
(
fetch
,
ops
::
FetchOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_conv_add.cpp
浏览文件 @
dd0cf789
...
...
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
conv_add
);
REGISTER_OPERATOR_MALI_GPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_conv_add.h
浏览文件 @
dd0cf789
...
...
@@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static
framework
::
FusionOpRegistrar
convadd_registrar
(
new
FusionConvAddMatcher
());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static
framework
::
FusionOpRegistrar
convadd_registrar
(
new
FusionConvAddMatcher
());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_conv_add_relu_op.h
浏览文件 @
dd0cf789
...
...
@@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// FusionConvAddReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
...
...
src/operators/fusion_fc_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fc
);
REGISTER_OPERATOR_MALI_GPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_fc_op.h
浏览文件 @
dd0cf789
...
...
@@ -66,11 +66,19 @@ class FusionFcOp
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/kernel/arm/batchnorm_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include "operators/kernel/batchnorm_kernel.h"
#include
<cmath>
#include
"operators/kernel/central-arm-func/batchnorm_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -28,215 +28,7 @@ bool BatchNormKernel<CPU, float>::Init(const BatchNormParam ¶) const {
template
<
>
void
BatchNormKernel
<
CPU
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
const
auto
&
x_dims
=
input_x
->
dims
();
const
int
N
=
x_dims
[
0
];
const
int
C
=
x_dims
[
1
];
const
int
H
=
x_dims
[
2
];
const
int
W
=
x_dims
[
3
];
const
int
stride0
=
C
*
H
*
W
;
const
int
stride1
=
H
*
W
;
const
int
stride2
=
W
;
Tensor
*
out
=
param
.
OutputY
();
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
const
float
epsilon
=
param
.
Epsilon
();
const
Tensor
*
mean
=
param
.
InputMean
();
const
Tensor
*
variance
=
param
.
InputVariance
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE
(
C
==
variance
->
numel
(),
"C must equal to variance.numel()"
);
int
HXW
=
H
*
W
;
if
(
HXW
>
32
)
{
int
NXC
=
N
*
C
;
float
*
inv_std_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_scale_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_bias_ptr
=
new
float
[
NXC
*
4
];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for
(
int
i
=
0
;
i
<
C
*
4
;
i
+=
4
)
{
int
index
=
i
/
4
;
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
index
]
+
epsilon
),
0.5
));
inv_std_ptr
[
i
+
1
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
2
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
3
]
=
inv_std_ptr
[
i
];
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_scale_ptr
[
i
+
1
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
2
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
3
]
=
new_scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
index
]
-
mean_ptr
[
index
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_bias_ptr
[
i
+
1
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
2
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
3
]
=
new_bias_ptr
[
i
];
}
for
(
int
j
=
C
*
4
;
j
<
NXC
*
4
;
++
j
)
{
new_scale_ptr
[
j
]
=
new_scale_ptr
[
j
-
C
*
4
];
new_bias_ptr
[
j
]
=
new_bias_ptr
[
j
-
C
*
4
];
}
asm
volatile
(
"subs %[N], %[N], #1
\n\t
"
"blt end_n_%=
\n\t
"
"loop_n_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"blt end_c_%=
\n\t
"
"loop_c_%=:
\n\t
"
"vld1.32 {q9}, [%[new_scale_ptr]]!
\n\t
"
"vld1.32 {q10}, [%[new_bias_ptr]]!
\n\t
"
"mov r6, %[HXW]
\n\t
"
"subs r6, r6, #32
\n\t
"
"blt end_hw_%=
\n\t
"
"loop_hw_%=:
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"subs r6, r6, #32
\n\t
"
"bge loop_hw_%=
\n\t
"
"end_hw_%=:
\n\t
"
"cmp r6, #0
\n\t
"
"bge end_remainder_%=
\n\t
"
"mov r5, #4
\n\t
"
"mul r6, r6, r5
\n\t
"
"add %[input_x_ptr], %[input_x_ptr], r6
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"add %[out_ptr], %[out_ptr], r6
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"end_remainder_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"bge loop_c_%=
\n\t
"
"end_c_%=:
\n\t
"
"subs %[N], %[N], #1
\n\t
"
"bge loop_n_%=
\n\t
"
"end_n_%=:
\n\t
"
:
:
[
input_x_ptr
]
"r"
(
input_x_ptr
),
[
out_ptr
]
"r"
(
out_ptr
),
[
new_scale_ptr
]
"r"
(
new_scale_ptr
),
[
new_bias_ptr
]
"r"
(
new_bias_ptr
),
[
N
]
"r"
(
N
),
[
C
]
"r"
(
C
),
[
HXW
]
"r"
(
HXW
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"r5"
,
"r6"
);
delete
[]
inv_std_ptr
;
delete
[]
new_scale_ptr
;
delete
[]
new_bias_ptr
;
}
else
{
float
*
inv_std_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
Tensor
new_scale
;
auto
new_scale_ptr
=
new_scale
.
mutable_data
<
float
>
(
make_ddim
({
C
}));
Tensor
new_bias
;
auto
new_bias_ptr
=
new_bias
.
mutable_data
<
float
>
(
make_ddim
({
C
}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
h
=
0
;
h
<
H
;
h
++
)
{
int
tmp_index
=
n
*
stride0
+
i
*
stride1
+
h
*
stride2
;
for
(
int
w
=
0
;
w
<
W
;
w
++
)
{
int
index
=
tmp_index
+
w
;
out_ptr
[
index
]
=
input_x_ptr
[
index
]
*
new_scale_ptr
[
i
]
+
new_bias_ptr
[
i
];
}
}
}
}
}
delete
[]
inv_std_ptr
;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
}
BatchnormCompute
<
float
>
(
param
);
}
}
// namespace operators
...
...
src/operators/kernel/arm/conv_add_relu_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -28,92 +29,7 @@ bool ConvAddReluKernel<CPU, float>::Init(
template
<
>
void
ConvAddReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddReluParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
math
::
expand_bias
(
bias
,
axis
,
output
->
dims
());
output
->
ShareDataWith
(
bias
);
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
);
}
}
ConvAddReluCompute
<
float
>
(
param
);
}
template
class
ConvAddReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/conv_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -26,88 +27,7 @@ bool ConvKernel<CPU, float>::Init(const ConvParam ¶) const {
template
<
>
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
ConvCompute
<
float
>
(
param
);
}
template
class
ConvKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/depthwise_conv_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/c
onv_kernel
.h"
#include "operators/kernel/c
entral-arm-func/depthwise_conv_arm_func
.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam ¶) const {
template
<
>
void
DepthwiseConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
LOG
(
kLOG_DEBUG
)
<<
param
;
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
// DLOG << " compute end get Attrs " << strides[0];
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
DepthwiseConvCompute
<
float
>
(
param
);
}
template
class
DepthwiseConvKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
BatchnormCompute
(
const
BatchNormParam
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
const
auto
&
x_dims
=
input_x
->
dims
();
const
int
N
=
x_dims
[
0
];
const
int
C
=
x_dims
[
1
];
const
int
H
=
x_dims
[
2
];
const
int
W
=
x_dims
[
3
];
const
int
stride0
=
C
*
H
*
W
;
const
int
stride1
=
H
*
W
;
const
int
stride2
=
W
;
Tensor
*
out
=
param
.
OutputY
();
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
const
float
epsilon
=
param
.
Epsilon
();
const
Tensor
*
mean
=
param
.
InputMean
();
const
Tensor
*
variance
=
param
.
InputVariance
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE
(
C
==
variance
->
numel
(),
"C must equal to variance.numel()"
);
int
HXW
=
H
*
W
;
if
(
HXW
>
32
)
{
int
NXC
=
N
*
C
;
float
*
inv_std_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_scale_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_bias_ptr
=
new
float
[
NXC
*
4
];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for
(
int
i
=
0
;
i
<
C
*
4
;
i
+=
4
)
{
int
index
=
i
/
4
;
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
index
]
+
epsilon
),
0.5
));
inv_std_ptr
[
i
+
1
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
2
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
3
]
=
inv_std_ptr
[
i
];
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_scale_ptr
[
i
+
1
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
2
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
3
]
=
new_scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
index
]
-
mean_ptr
[
index
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_bias_ptr
[
i
+
1
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
2
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
3
]
=
new_bias_ptr
[
i
];
}
for
(
int
j
=
C
*
4
;
j
<
NXC
*
4
;
++
j
)
{
new_scale_ptr
[
j
]
=
new_scale_ptr
[
j
-
C
*
4
];
new_bias_ptr
[
j
]
=
new_bias_ptr
[
j
-
C
*
4
];
}
asm
volatile
(
"subs %[N], %[N], #1
\n\t
"
"blt end_n_%=
\n\t
"
"loop_n_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"blt end_c_%=
\n\t
"
"loop_c_%=:
\n\t
"
"vld1.32 {q9}, [%[new_scale_ptr]]!
\n\t
"
"vld1.32 {q10}, [%[new_bias_ptr]]!
\n\t
"
"mov r6, %[HXW]
\n\t
"
"subs r6, r6, #32
\n\t
"
"blt end_hw_%=
\n\t
"
"loop_hw_%=:
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"subs r6, r6, #32
\n\t
"
"bge loop_hw_%=
\n\t
"
"end_hw_%=:
\n\t
"
"cmp r6, #0
\n\t
"
"bge end_remainder_%=
\n\t
"
"mov r5, #4
\n\t
"
"mul r6, r6, r5
\n\t
"
"add %[input_x_ptr], %[input_x_ptr], r6
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"add %[out_ptr], %[out_ptr], r6
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"end_remainder_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"bge loop_c_%=
\n\t
"
"end_c_%=:
\n\t
"
"subs %[N], %[N], #1
\n\t
"
"bge loop_n_%=
\n\t
"
"end_n_%=:
\n\t
"
:
:
[
input_x_ptr
]
"r"
(
input_x_ptr
),
[
out_ptr
]
"r"
(
out_ptr
),
[
new_scale_ptr
]
"r"
(
new_scale_ptr
),
[
new_bias_ptr
]
"r"
(
new_bias_ptr
),
[
N
]
"r"
(
N
),
[
C
]
"r"
(
C
),
[
HXW
]
"r"
(
HXW
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"r5"
,
"r6"
);
delete
[]
inv_std_ptr
;
delete
[]
new_scale_ptr
;
delete
[]
new_bias_ptr
;
}
else
{
float
*
inv_std_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
Tensor
new_scale
;
auto
new_scale_ptr
=
new_scale
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
C
}));
Tensor
new_bias
;
auto
new_bias_ptr
=
new_bias
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
C
}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
h
=
0
;
h
<
H
;
h
++
)
{
int
tmp_index
=
n
*
stride0
+
i
*
stride1
+
h
*
stride2
;
for
(
int
w
=
0
;
w
<
W
;
w
++
)
{
int
index
=
tmp_index
+
w
;
out_ptr
[
index
]
=
input_x_ptr
[
index
]
*
new_scale_ptr
[
i
]
+
new_bias_ptr
[
i
];
}
}
}
}
}
delete
[]
inv_std_ptr
;
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvAddReluCompute
(
const
FusionConvAddReluParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
math
::
expand_bias
(
bias
,
axis
,
output
->
dims
());
output
->
ShareDataWith
(
bias
);
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
);
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvCompute
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
DepthwiseConvCompute
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
// DLOG << " compute end get Attrs " << strides[0];
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_kernel.h
浏览文件 @
dd0cf789
...
...
@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
bool
Init
(
const
ConvParam
&
para
)
const
;
};
inline
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
dilations
)
{
bool
filter_1
=
true
,
strides_1
=
true
,
padding_0
=
true
,
dilation_1
=
true
;
for
(
size_t
j
=
0
;
j
<
strides
.
size
();
++
j
)
{
filter_1
=
filter_1
&&
(
static_cast
<
int
>
(
filter_dim
[
j
+
2
])
==
1
);
strides_1
=
strides_1
&&
(
strides
[
j
]
==
1
);
padding_0
=
padding_0
&&
(
paddings
[
j
]
==
0
);
dilation_1
=
dilation_1
&&
(
dilations
[
j
]
==
1
);
}
return
!
(
filter_1
&&
strides_1
&&
padding_0
&&
dilation_1
);
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
ACL_Android
@
591027fc
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
src/operators/kernel/mali/acl_operator.cc
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
unsigned
int
bypass_acl_class_layer
=
(
0
|
FLAGS_ENABLE_ACL_CONCAT
|
/*0xffffffff |*/
/*FLAGS_ENABLE_ACL_FC |*/
/*FLAGS_ENABLE_ACL_LRN
|*/
0
);
int
enable_schedule
=
0
;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned
int
acl_log_flags
=
(
0
|
MASK_LOG_APP_TIME
|
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */
/*MASK_LOG_CONFIG | */
/*MASK_LOG_COPY | */
MASK_LOG_ABSVAL
|
MASK_LOG_BNLL
|
MASK_LOG_CONV
|
MASK_LOG_FC
|
MASK_LOG_LRN
|
MASK_LOG_POOLING
|
MASK_LOG_RELU
|
MASK_LOG_SIGMOID
|
MASK_LOG_SOFTMAX
|
MASK_LOG_TANH
|
MASK_LOG_LC
|
MASK_LOG_BN
|
MASK_LOG_CONCAT
|
0
);
#include <stdio.h>
/* printf */
#include <stdlib.h>
/* getenv */
#endif // USE_PROFILING
static
bool
force_enable_gpu
=
false
;
bool
AclEnableSchedule
(
int
enable
)
{
enable_schedule
=
enable
;
if
(
enable
)
{
force_enable_gpu
=
true
;
}
return
true
;
}
int
isScheduleEnable
()
{
return
enable_schedule
;
}
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
bool
ACLOperator
::
init_gpu_env
=
true
;
#ifdef USE_OPENCL
bool
ACLOperator
::
support_opencl_
=
false
;
bool
opencl_is_available
()
{
return
arm_compute
::
opencl_is_available
();
}
#elif defined(USE_OPENGLES)
bool
ACLOperator
::
support_opengles_
=
false
;
#endif
ACLOperator
::
ACLOperator
(
bool
is_gpu
)
:
operator_state_
(
operator_not_init
),
force_bypass_acl_path_
(
false
),
target_hint_
(
TargetHint
::
DONT_CARE
),
convolution_method_hint_
(
ConvolutionMethodHint
::
GEMM
),
_group
(
1
),
name_
(
""
),
input_idx_
(
0
),
output_idx_
(
0
),
is_gpu_
(
is_gpu
)
{
const
char
*
pBypassACL
;
if
(
init_gpu_env
)
{
#ifdef USE_OPENCL
try
{
if
(
opencl_is_available
())
{
arm_compute
::
CLScheduler
::
get
().
default_init
();
support_opencl_
=
true
;
}
}
catch
(
std
::
exception
&
e
)
{
support_opencl_
=
false
;
}
#elif defined(USE_OPENGLES)
try
{
arm_compute
::
GCScheduler
::
get
().
default_init
();
support_opengles_
=
true
;
}
catch
(
std
::
exception
&
e
)
{
support_opengles_
=
false
;
}
#endif
init_gpu_env
=
false
;
}
if
(
force_enable_gpu
)
is_gpu_
=
true
;
pBypassACL
=
getenv
(
"BYPASSACL"
);
if
(
pBypassACL
)
{
unsigned
int
bacl
;
sscanf
(
pBypassACL
,
"%i"
,
&
bacl
);
if
(
bacl
!=
bypass_acl_class_layer
)
{
bypass_acl_class_layer
=
bacl
;
printf
(
"BYPASSACL<%s>
\n
"
,
pBypassACL
);
printf
(
"BYPASSACL: %x
\n
"
,
bypass_acl_class_layer
);
}
}
#ifdef USE_PROFILING
const
char
*
pLogACL
;
pLogACL
=
getenv
(
"LOGACL"
);
if
(
pLogACL
)
{
unsigned
int
alf
;
sscanf
(
pLogACL
,
"%i"
,
&
alf
);
if
(
alf
!=
acl_log_flags
)
{
acl_log_flags
=
alf
;
printf
(
"LOGACL<%s>
\n
"
,
pLogACL
);
printf
(
"LOGACL: %x
\n
"
,
acl_log_flags
);
}
}
#endif // USE_PROFILING
const
char
*
pEnableSchedule
;
pEnableSchedule
=
getenv
(
"ENABLESCHEDULE"
);
if
(
pEnableSchedule
)
{
int
bshedule
;
sscanf
(
pEnableSchedule
,
"%i"
,
&
bshedule
);
if
(
bshedule
!=
enable_schedule
)
{
enable_schedule
=
bshedule
;
printf
(
"ENABLESCHEDULE<%s>
\n
"
,
pEnableSchedule
);
printf
(
"ENABLESCHEDULE: %x
\n
"
,
enable_schedule
);
}
if
(
enable_schedule
)
{
AclEnableSchedule
(
1
);
}
}
}
ACLOperator
::~
ACLOperator
()
{}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLTensor
>&
tensor
,
arm_compute
::
TensorShape
&
shape
,
void
*
mem
,
bool
commit
)
{
auto
acl_tensor
=
new
ACLTensor
(
arm_compute
::
TensorInfo
(
shape
,
arm_compute
::
Format
::
F32
));
acl_tensor
->
set_target
(
getTargetHint
());
acl_tensor
->
bindmem
(
mem
);
if
(
commit
)
acl_tensor
->
commit
();
tensor
=
(
std
::
unique_ptr
<
ACLTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLSubTensor
>&
tensor
,
std
::
unique_ptr
<
ACLTensor
>&
parent
,
arm_compute
::
TensorShape
&
shape
,
arm_compute
::
Coordinates
&
coord
)
{
auto
acl_tensor
=
new
ACLSubTensor
(
parent
,
shape
,
coord
);
acl_tensor
->
set_target
(
getTargetHint
());
tensor
=
(
std
::
unique_ptr
<
ACLSubTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
void
ACLTensor
::
commit
(
TensorType
type
)
{
settensortype
(
type
);
if
(
mem_
)
{
if
(
!
allocate_
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_ALLOCATE_INFO
);
#endif // USE_PROFILING
allocate
();
allocate_
=
true
;
}
if
(
type_
!=
tensor_output
)
{
tensor_copy
(
mem_
);
}
mem_
=
nullptr
;
}
}
int
BaseACLTensor
::
tensor_copy
(
arm_compute
::
ITensor
*
tensor
,
void
*
mem
,
bool
toTensor
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_COPY_INFO
);
#endif // USE_PROFILING
arm_compute
::
Window
window
;
// Iterate through the rows (not each element)
window
.
use_tensor_dimensions
(
tensor
->
info
()
->
tensor_shape
(),
/* first_dimension =*/
arm_compute
::
Window
::
DimY
);
int
width
=
tensor
->
info
()
->
tensor_shape
()[
0
];
int
height
=
tensor
->
info
()
->
tensor_shape
()[
1
];
int
deepth
=
tensor
->
info
()
->
tensor_shape
()[
2
];
map
();
// Create an iterator:
arm_compute
::
Iterator
it
(
tensor
,
window
);
// Except it works for an arbitrary number of dimensions
if
(
toTensor
)
{
// mem->tensor
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(
it
.
ptr
(),
((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
+
id
.
x
())
*
tensor
->
info
()
->
element_size
()),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
else
{
// tensor-->mem
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
)
*
tensor
->
info
()
->
element_size
()),
it
.
ptr
(),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
unmap
();
return
0
;
}
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/acl_operator.h
0 → 100644
浏览文件 @
dd0cf789
此差异已折叠。
点击以展开。
src/operators/kernel/mali/acl_tensor.cc
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
#ifdef USE_ACL
template
<
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_tensor
(
arm_compute
::
TensorInfo
&
info
)
{
auto
tensor
=
cpp14
::
make_unique
<
TensorType
>
();
tensor
->
allocator
()
->
init
(
info
);
return
std
::
move
(
tensor
);
}
template
<
typename
TensorType
>
void
tensor_allocate
(
arm_compute
::
ITensor
&
tensor
)
{
auto
itensor
=
dynamic_cast
<
TensorType
*>
(
&
tensor
);
itensor
->
allocator
()
->
allocate
();
}
Tensor
::
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_info
(
info
),
_tensor
(
nullptr
)
{}
Tensor
::
Tensor
(
Tensor
&&
src
)
noexcept
:
_target
(
src
.
_target
),
_info
(
std
::
move
(
src
.
_info
)),
_tensor
(
std
::
move
(
src
.
_tensor
))
{}
arm_compute
::
ITensor
*
Tensor
::
set_target
(
TargetHint
target
)
{
switch
(
target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_tensor
=
initialise_tensor
<
arm_compute
::
CLTensor
>
(
_info
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
_tensor
=
initialise_tensor
<
arm_compute
::
GCTensor
>
(
_info
);
break
;
#endif
case
TargetHint
::
NEON
:
_tensor
=
initialise_tensor
<
arm_compute
::
Tensor
>
(
_info
);
break
;
default:
break
;
}
_target
=
target
;
return
_tensor
.
get
();
}
void
Tensor
::
allocate
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
tensor_allocate
<
arm_compute
::
CLTensor
>
(
*
_tensor
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
tensor_allocate
<
arm_compute
::
GCTensor
>
(
*
_tensor
);
break
;
#endif
case
TargetHint
::
NEON
:
tensor_allocate
<
arm_compute
::
Tensor
>
(
*
_tensor
);
break
;
default:
break
;
}
}
void
Tensor
::
map
(
bool
blocking
)
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
map
(
blocking
);
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
map
(
blocking
);
#endif
}
void
Tensor
::
unmap
()
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
unmap
();
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
unmap
();
#endif
}
template
<
typename
SubTensorType
,
typename
ParentTensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_subtensor
(
arm_compute
::
ITensor
*
parent
,
arm_compute
::
TensorShape
shape
,
arm_compute
::
Coordinates
coords
)
{
auto
ptensor
=
dynamic_cast
<
ParentTensorType
*>
(
parent
);
auto
subtensor
=
cpp14
::
make_unique
<
SubTensorType
>
(
ptensor
,
shape
,
coords
);
return
std
::
move
(
subtensor
);
}
SubTensor
::
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_tensor_shape
(
tensor_shape
),
_coords
(
coords
),
_parent
(
nullptr
),
_subtensor
(
nullptr
)
{
_parent
=
parent
->
tensor
();
_target
=
parent
->
target
();
instantiate_subtensor
();
}
arm_compute
::
ITensor
*
SubTensor
::
set_target
(
TargetHint
target
)
{
return
(
target
==
_target
)
?
_subtensor
.
get
()
:
nullptr
;
}
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
{
return
_subtensor
.
get
();
}
const
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
const
{
return
_subtensor
.
get
();
}
TargetHint
SubTensor
::
target
()
const
{
return
_target
;
}
void
SubTensor
::
allocate
()
{
// NOP for sub-tensors
}
void
SubTensor
::
instantiate_subtensor
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
CLSubTensor
,
arm_compute
::
ICLTensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
#endif
default:
case
TargetHint
::
NEON
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
SubTensor
,
arm_compute
::
ITensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
}
}
#endif
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
src/operators/kernel/mali/acl_tensor.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
enum
class
TargetHint
{
DONT_CARE
,
OPENCL
,
OPENGLES
,
NEON
,
};
enum
class
ConvolutionMethodHint
{
GEMM
,
DIRECT
,
};
namespace
cpp14
{
template
<
class
T
>
struct
_Unique_if
{
typedef
std
::
unique_ptr
<
T
>
_Single_object
;
};
template
<
class
T
>
struct
_Unique_if
<
T
[]
>
{
typedef
std
::
unique_ptr
<
T
[]
>
_Unknown_bound
;
};
template
<
class
T
,
size_t
N
>
struct
_Unique_if
<
T
[
N
]
>
{
typedef
void
_Known_bound
;
};
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Single_object
make_unique
(
Args
&&
...
args
)
{
return
std
::
unique_ptr
<
T
>
(
new
T
(
std
::
forward
<
Args
>
(
args
)...));
}
template
<
class
T
>
typename
_Unique_if
<
T
>::
_Unknown_bound
make_unique
(
size_t
n
)
{
typedef
typename
std
::
remove_extent
<
T
>::
type
U
;
return
std
::
unique_ptr
<
T
>
(
new
U
[
n
]());
}
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Known_bound
make_unique
(
Args
&&
...);
}
// namespace cpp14
class
Tensor
{
public:
explicit
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
;
virtual
~
Tensor
()
{}
Tensor
(
Tensor
&&
src
)
noexcept
;
void
set_info
(
arm_compute
::
TensorInfo
&&
info
)
{
_info
=
info
;
}
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
const
arm_compute
::
TensorInfo
&
info
()
const
{
return
_info
;
}
arm_compute
::
ITensor
*
tensor
()
{
return
_tensor
.
get
();
}
void
allocate
();
void
init
()
{}
TargetHint
target
()
const
{
return
_target
;
}
virtual
void
map
(
bool
blocking
=
true
);
virtual
void
unmap
();
private:
TargetHint
_target
;
arm_compute
::
TensorInfo
_info
;
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_tensor
;
};
class
SubTensor
{
public:
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
;
~
SubTensor
()
{}
arm_compute
::
ITensor
*
tensor
();
const
arm_compute
::
ITensor
*
tensor
()
const
;
TargetHint
target
()
const
;
void
allocate
();
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
private:
/** Instantiates a sub-tensor */
void
instantiate_subtensor
();
private:
/**< Target that this tensor is pinned on */
TargetHint
_target
;
/**< SubTensor shape */
arm_compute
::
TensorShape
_tensor_shape
;
/**< SubTensor Coordinates */
arm_compute
::
Coordinates
_coords
;
/**< Parent tensor */
arm_compute
::
ITensor
*
_parent
;
/**< SubTensor */
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_subtensor
;
};
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
src/operators/kernel/mali/batchnorm_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -16,20 +16,152 @@ limitations under the License. */
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclBatchNormOp
:
public
acl
::
ACLOperator
{
public:
AclBatchNormOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_BN
;
}
~
AclBatchNormOp
()
=
default
;
AclBatchNormOp
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
(
AclBatchNormOp
&&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
AclBatchNormOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
BatchNormParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
TensorShape
mean_shape
(
args
.
in_depth
);
arm_compute
::
TensorShape
var_shape
=
mean_shape
;
arm_compute
::
TensorShape
beta_shape
=
mean_shape
;
arm_compute
::
TensorShape
gamma_shape
=
mean_shape
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
new_tensor
(
mean
(),
mean_shape
,
args
.
mean_data
);
new_tensor
(
var
(),
var_shape
,
args
.
var_data
);
new_tensor
(
beta
(),
beta_shape
,
args
.
biases_data
);
new_tensor
(
gamma
(),
gamma_shape
,
args
.
weight_data
);
acl_configure
(
bn
,
this
,
args
.
epsilon
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
BatchNormParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
BatchNormParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
OutputY
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
const
Tensor
*
saved_mean
=
param
.
InputMean
();
const
Tensor
*
saved_variance
=
param
.
InputVariance
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
scale
->
data
<
T
>
();
const
T
*
bias_data
=
bias
->
data
<
T
>
();
const
T
*
mean_data
=
saved_mean
->
data
<
T
>
();
const
T
*
var_data
=
saved_variance
->
data
<
T
>
();
float
epsilon
=
param
.
Epsilon
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args
.
mean_data
=
(
void
*
)
mean_data
;
args
.
var_data
=
(
void
*
)
var_data
;
args
.
epsilon
=
epsilon
;
args
.
dim
=
in_x
->
dims
().
size
();
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
args
.
out_depth
=
out
->
dims
()[
1
];
args
.
out_rows
=
out
->
dims
()[
2
];
args
.
out_cols
=
out
->
dims
()[
3
];
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
(
void
*
)
bias_data
;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
para
)
const
{
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
param
)
const
{
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclBatchNormOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
BatchNormKernel
<
GPU_MALI
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{}
const
BatchNormParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
BatchNormKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/concat_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConcatOp
:
public
acl
::
ACLOperator
{
public:
AclConcatOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONCAT
;
}
~
AclConcatOp
()
=
default
;
AclConcatOp
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
&
operator
=
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
(
AclConcatOp
&&
)
=
delete
;
AclConcatOp
&
operator
=
(
AclConcatOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConcatParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
const
std
::
vector
<
framework
::
LoDTensor
*>*
input_data
=
&
args
.
in_tensor
;
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
output_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
T
type
;
for
(
int
i
=
0
;
i
<
input_data
->
size
();
i
++
)
{
const
T
*
idata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
const
T
*
pdata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
int
in_batch
=
(
*
input_data
)[
i
]
->
dims
()[
0
];
int
in_channels
=
(
*
input_data
)[
i
]
->
dims
()[
1
];
int
in_width
=
(
*
input_data
)[
i
]
->
dims
()[
2
];
int
in_height
=
(
*
input_data
)[
i
]
->
dims
()[
3
];
arm_compute
::
TensorShape
in_shape
(
in_width
,
in_height
,
in_channels
);
new_tensor
(
cinput
(
i
),
in_shape
,
acl
::
InputdataPtr
(
this
,
args
.
in_tensor
,
type
,
i
));
}
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
concat
,
this
,
input_data
->
size
());
}
void
RunAcl
(
const
std
::
vector
<
framework
::
LoDTensor
*>&
input
,
void
*
output
)
{
T
type
;
acl
::
acl_run
(
this
,
input
,
output
,
type
);
}
bool
Bypass_acl
(
const
ConcatParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
!
args
.
is_channel_concat
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ConcatParam
&
param
)
{
auto
inputs
=
param
.
Inputs
();
auto
*
output
=
param
.
Out
();
int64_t
axis
=
param
.
Axis
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
args
.
is_channel_concat
=
(
axis
==
1
);
args
.
in_tensor
=
inputs
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConcatKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConcatParam
&
param
)
const
{
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConcatOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConcatKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConcatParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
std
::
vector
<
framework
::
LoDTensor
*>
temp_data
=
args
.
in_tensor
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
(
temp_data
,
(
void
*
)
output_data
);
}
template
class
ConcatKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_add_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvAddOp
:
public
acl
::
ACLOperator
{
public:
AclConvAddOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvAddOp
()
=
default
;
AclConvAddOp
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
&
operator
=
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
(
AclConvAddOp
&&
)
=
delete
;
AclConvAddOp
&
operator
=
(
AclConvAddOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
FusionConvAddParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
arm_compute
::
TensorShape
biases_shape
(
args
.
out_depth
);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
if
(
args
.
biases_data
)
{
new_tensor
(
biases
(),
biases_shape
,
args
.
biases_data
);
}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
FusionConvAddParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
FusionConvAddParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
Tensor
*
bias
;
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
try
{
bias
=
param
.
Bias
();
}
catch
(
const
std
::
exception
&
e
)
{
}
if
(
bias
)
{
const
T
*
biases_data
=
bias
->
data
<
T
>
();
args
.
biases_data
=
(
void
*
)
biases_data
;
}
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConvAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionConvAddParam
&
param
)
const
{
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvAddOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConvAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionConvAddParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ConvAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_kernel.cpp
浏览文件 @
dd0cf789
...
...
@@ -15,20 +15,213 @@ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvOp
:
public
acl
::
ACLOperator
{
public:
AclConvOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvOp
()
=
default
;
AclConvOp
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
&
operator
=
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
(
AclConvOp
&&
)
=
delete
;
AclConvOp
&
operator
=
(
AclConvOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConvParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ConvParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
std
::
cout
<<
"In N: "
<<
args
.
batch
<<
" C: "
<<
args
.
in_depth
<<
" H: "
<<
args
.
in_rows
<<
" W: "
<<
args
.
in_cols
<<
"
\n
"
;
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
param
)
const
{
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
// ArmConvImplement imp;
// imp.Compute(param);
param
.
Output
()
->
mutable_data
<
float
>
()[
0
]
=
100.0
;
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ConvKernel
<
GPU_MALI
,
float
>;
...
...
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/elementwise_add_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
T
>
struct
AddFunctor
{
inline
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
template
<
>
bool
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
ElementwiseAddParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ElementwiseAddParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
Out
=
param
.
Out
();
Out
->
mutable_data
<
float
>
();
int
axis
=
param
.
Axis
();
ElementwiseComputeEx
<
AddFunctor
<
float
>
,
float
>
(
input_x
,
input_y
,
axis
,
AddFunctor
<
float
>
(),
Out
);
}
template
class
ElementwiseAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/fushion_fc_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
FusionFcKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionFcParam
&
para
)
const
{
return
true
;
}
template
<
>
void
FusionFcKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionFcParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_z
=
param
.
InputZ
();
auto
*
input_z_data
=
input_z
->
data
<
float
>
();
int
axis
=
param
.
Axis
();
Tensor
*
out
=
param
.
Out
();
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
input_z
->
dims
().
size
()
==
1
,
"inpu_z size must be 1"
);
PADDLE_MOBILE_ENFORCE
(
out_dim
[
1
]
==
input_z
->
dims
()[
0
],
" out_dim.size must be 2."
);
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
)
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
DLOG
<<
out_data
[
i
];
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
));
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/lrn_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclLrnOp
:
public
acl
::
ACLOperator
{
public:
AclLrnOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_LRN
;
}
~
AclLrnOp
()
=
default
;
AclLrnOp
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
&
operator
=
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
(
AclLrnOp
&&
)
=
delete
;
AclLrnOp
&
operator
=
(
AclLrnOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
LrnParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
NormalizationLayerInfo
norm_info
(
arm_compute
::
NormType
::
CROSS_MAP
,
args
.
nsize
,
args
.
alpha
,
args
.
beta
,
args
.
knorm
);
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
lrn
,
this
,
norm_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
LrnParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
LrnParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
Out
();
int
n
=
param
.
N
();
T
alpha
=
param
.
Alpha
();
T
beta
=
param
.
Beta
();
T
k
=
param
.
K
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
nsize
=
n
;
args
.
alpha
=
alpha
;
args
.
beta
=
beta
;
args
.
knorm
=
k
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
LrnKernel
<
GPU_MALI
,
float
>::
Init
(
const
LrnParam
&
param
)
const
{
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclLrnOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
LrnKernel
<
GPU_MALI
,
float
>::
Compute
(
const
LrnParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
}
}
template
class
LrnKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/mul_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
MulKernel
<
GPU_MALI
,
float
>::
Init
(
const
MulParam
&
para
)
const
{
return
true
;
}
template
<
>
void
MulKernel
<
GPU_MALI
,
float
>::
Compute
(
const
MulParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
out
=
param
.
Out
();
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
(
out_dim
);
}
}
template
class
MulKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/pool_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclPoolOp
:
public
acl
::
ACLOperator
{
public:
AclPoolOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_POOLING
;
}
~
AclPoolOp
()
=
default
;
AclPoolOp
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
&
operator
=
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
(
AclPoolOp
&&
)
=
delete
;
AclPoolOp
&
operator
=
(
AclPoolOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
PoolParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PoolingLayerInfo
pool_info
;
if
(
args
.
pool_type
==
"max"
)
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
MAX
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
else
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
AVG
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
pooling
,
this
,
pool_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
PoolParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
if
(
args
.
pool_type
!=
"max"
&&
args
.
pool_type
!=
"avg"
)
{
bypass_acl
=
true
;
}
if
(
args
.
filter_rows
!=
args
.
filter_cols
)
{
bypass_acl
=
true
;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
PoolParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
Input
();
Tensor
*
out
=
param
.
Output
();
std
::
string
pooling_type
=
param
.
PoolingType
();
std
::
vector
<
int
>
ksize
=
param
.
Ksize
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
bool
is_global_pooling
=
param
.
isGlobalPooling
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
is_global_pool
=
is_global_pooling
;
args
.
pool_type
=
pooling_type
;
args
.
filter_rows
=
ksize
[
0
];
args
.
filter_cols
=
ksize
[
1
];
args
.
dim
=
ksize
.
size
();
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
args
.
out_depth
=
args
.
in_depth
;
// args.out_rows = out->dims()[2];
// args.out_cols = out->dims()[3];
args
.
out_rows
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_rows
+
2
*
args
.
pad_rows
-
args
.
filter_rows
)
/
args
.
stride_rows
))
+
1
;
args
.
out_cols
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_cols
+
2
*
args
.
pad_cols
-
args
.
filter_cols
)
/
args
.
stride_cols
))
+
1
;
if
(
is_global_pooling
)
{
args
.
filter_rows
=
args
.
in_rows
;
args
.
filter_cols
=
args
.
in_cols
;
args
.
pad_rows
=
0
;
args
.
pad_cols
=
0
;
}
}
acl
::
AclParameters
args
;
};
template
<
>
bool
PoolKernel
<
GPU_MALI
,
float
>::
Init
(
const
PoolParam
&
param
)
const
{
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclPoolOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
PoolKernel
<
GPU_MALI
,
float
>::
Compute
(
const
PoolParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
out_cols
*
args
.
out_rows
;
}
}
template
class
PoolKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/relu_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclReluOp
:
public
acl
::
ACLOperator
{
public:
AclReluOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_RELU
;
}
~
AclReluOp
()
=
default
;
AclReluOp
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
&
operator
=
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
(
AclReluOp
&&
)
=
delete
;
AclReluOp
&
operator
=
(
AclReluOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ReluParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
out_num
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
type
;
type
=
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
::
RELU
;
arm_compute
::
ActivationLayerInfo
act_info
(
type
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
activation
,
this
,
act_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ReluParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ReluParam
&
param
)
{
const
auto
*
input_x
=
param
.
InputX
();
auto
*
out
=
param
.
Out
();
const
T
*
input_data
=
input_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
input_x
->
dims
()[
0
];
args
.
in_depth
=
input_x
->
dims
()[
1
];
args
.
in_rows
=
input_x
->
dims
()[
2
];
args
.
in_cols
=
input_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ReluKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReluParam
&
param
)
const
{
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclReluOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ReluKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReluParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ReluKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/reshape_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ReshapeKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReshapeParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ReshapeKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReshapeParam
&
param
)
const
{
const
auto
*
input_x
=
param
.
InputX
();
const
auto
&
input_x_dims
=
input_x
->
dims
();
auto
*
out
=
param
.
Out
();
framework
::
DDim
out_dims
=
out
->
dims
();
const
auto
*
input_shape
=
param
.
InputShape
();
if
(
input_shape
)
{
auto
*
shape_data
=
input_shape
->
data
<
int
>
();
framework
::
Tensor
cpu_shape_tensor
;
auto
shape
=
std
::
vector
<
int
>
(
shape_data
,
shape_data
+
input_shape
->
numel
());
out_dims
=
ValidateShape
(
shape
,
input_x
->
dims
());
}
bool
inplace
=
param
.
Inplace
();
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
<
float
>
();
framework
::
TensorCopy
(
*
input_x
,
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
input_x
);
out
->
Resize
(
out_dims
);
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/softmax_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclSoftmaxOp
:
public
acl
::
ACLOperator
{
public:
AclSoftmaxOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_SOFTMAX
;
}
~
AclSoftmaxOp
()
=
default
;
AclSoftmaxOp
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
(
AclSoftmaxOp
&&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
AclSoftmaxOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
SoftmaxParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
softmax
,
this
,
NULL
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
SoftmaxParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
SoftmaxParam
&
param
)
{
const
framework
::
Tensor
*
in_x
=
param
.
InputX
();
framework
::
Tensor
*
out
=
param
.
Out
();
auto
x_dims
=
in_x
->
dims
();
out
->
Resize
(
x_dims
);
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
out_num
=
out
->
dims
()[
0
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
SoftmaxKernel
<
GPU_MALI
,
float
>::
Init
(
const
SoftmaxParam
&
param
)
const
{
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclSoftmaxOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
SoftmaxKernel
<
GPU_MALI
,
float
>::
Compute
(
const
SoftmaxParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
out_num
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
;
output_data
+=
args
.
in_depth
;
}
}
template
class
SoftmaxKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/lrn_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU
(
lrn
,
ops
::
LrnOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
lrn
);
REGISTER_OPERATOR_MALI_GPU
(
lrn
,
ops
::
LrnOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/math/depthwiseconv3x3s1p1.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwiseconv3x3s1p1.h"
#include <arm_neon.h>
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
void
DepthwiseConv3x3s1p1
(
const
Tensor
*
input
,
Tensor
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
)
{
const
float
*
input_data
=
input
->
data
<
float
>
();
const
float
*
filter_data
=
filter
.
data
<
float
>
();
float
*
output_data
=
output
->
data
<
float
>
();
const
float
*
bias_data
=
bias
.
data
<
float
>
();
const
int
h
=
static_cast
<
int
>
(
input
->
dims
()[
2
]);
const
int
w
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
l
=
h
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
const
int
c
=
static_cast
<
int
>
(
input
->
dims
()[
1
]);
const
int
hxw
=
h
*
w
;
float32x4_t
vbias
=
vdupq_n_f32
(
0.0
);
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
const
float
*
filter_data_tmp
=
filter_data
;
for
(
int
j
=
0
;
j
<
c
;
++
j
)
{
if
(
if_bias
)
{
vbias
=
vdupq_n_f32
(
bias_data
[
j
]);
}
int
l_mid
=
l
-
2
;
// l=1->l_mid=-1,l=2->l_mid=0
float
w00
=
filter_data_tmp
[
0
];
float
w01
=
filter_data_tmp
[
1
];
float
w02
=
filter_data_tmp
[
2
];
float
w10
=
filter_data_tmp
[
3
];
float
w11
=
filter_data_tmp
[
4
];
float
w12
=
filter_data_tmp
[
5
];
float
w20
=
filter_data_tmp
[
6
];
float
w21
=
filter_data_tmp
[
7
];
float
w22
=
filter_data_tmp
[
8
];
output_data
[
0
]
=
w11
*
input_data
[
0
]
+
w12
*
input_data
[
1
]
+
w21
*
input_data
[
l
]
+
w22
*
input_data
[
l
+
1
]
+
bias_data
[
j
];
output_data
[
l
-
1
]
=
w10
*
input_data
[
l
-
2
]
+
w11
*
input_data
[
l
-
1
]
+
w20
*
input_data
[
2
*
l
-
2
]
+
w21
*
input_data
[
2
*
l
-
1
]
+
bias_data
[
j
];
output_data
[(
l
-
1
)
*
l
]
=
w01
*
input_data
[(
l
-
2
)
*
l
]
+
w02
*
input_data
[(
l
-
2
)
*
l
+
1
]
+
w11
*
input_data
[(
l
-
1
)
*
l
]
+
w12
*
input_data
[(
l
-
1
)
*
l
+
1
]
+
bias_data
[
j
];
output_data
[
l
*
l
-
1
]
=
w00
*
input_data
[(
l
-
2
)
*
(
l
+
1
)]
+
w01
*
input_data
[(
l
-
2
)
*
(
l
+
1
)
+
1
]
+
w10
*
input_data
[
l
*
l
-
2
]
+
w11
*
input_data
[
l
*
l
-
1
]
+
bias_data
[
j
];
for
(
int
i
=
1
;
i
<
l
-
1
;
++
i
)
{
output_data
[
i
*
l
]
=
w01
*
input_data
[
i
*
l
-
l
]
+
w02
*
input_data
[
i
*
l
-
l
+
1
]
+
w11
*
input_data
[
i
*
l
]
+
w12
*
input_data
[
i
*
l
+
1
]
+
w21
*
input_data
[
i
*
l
+
l
]
+
w22
*
input_data
[
i
*
l
+
l
+
1
]
+
bias_data
[
j
];
output_data
[
i
*
l
+
l
-
1
]
=
w00
*
input_data
[
i
*
l
+
l
-
1
-
l
-
1
]
+
w01
*
input_data
[
i
*
l
+
l
-
1
-
l
]
+
w10
*
input_data
[
i
*
l
+
l
-
1
-
1
]
+
w11
*
input_data
[
i
*
l
+
l
-
1
]
+
w20
*
input_data
[
i
*
l
+
l
-
1
+
l
-
1
]
+
w21
*
input_data
[
i
*
l
+
l
-
1
+
l
]
+
bias_data
[
j
];
}
// top 1 row and bottom 1 row
const
float
*
input_tmp
=
input_data
;
float32x4_t
in0
,
in1
,
in2
,
in3
,
in4
,
in5
,
in6
,
in7
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
out0
;
in0
=
vld1q_f32
(
input_tmp
);
in2
=
vld1q_f32
(
input_tmp
+
l
);
const
float
*
input_tmp_end
=
input_tmp
+
(
l
-
2
)
*
l
;
in4
=
vld1q_f32
(
input_tmp_end
);
in6
=
vld1q_f32
(
input_tmp_end
+
l
);
int
c_mid
=
l_mid
;
auto
output_ptr
=
output_data
+
1
;
for
(;
c_mid
>
3
;
c_mid
-=
4
)
{
in1
=
vld1q_f32
(
input_tmp
+
4
);
in3
=
vld1q_f32
(
input_tmp
+
l
+
4
);
tmp0
=
vextq_f32
(
in0
,
in1
,
1
);
tmp1
=
vextq_f32
(
in0
,
in1
,
2
);
tmp2
=
vextq_f32
(
in2
,
in3
,
1
);
tmp3
=
vextq_f32
(
in2
,
in3
,
2
);
out0
=
vmulq_n_f32
(
in0
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
,
out0
);
in5
=
vld1q_f32
(
input_tmp_end
+
4
);
in7
=
vld1q_f32
(
input_tmp_end
+
l
+
4
);
tmp0
=
vextq_f32
(
in4
,
in5
,
1
);
tmp1
=
vextq_f32
(
in4
,
in5
,
2
);
tmp2
=
vextq_f32
(
in6
,
in7
,
1
);
tmp3
=
vextq_f32
(
in6
,
in7
,
2
);
out0
=
vmulq_n_f32
(
in4
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in6
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
+
(
l
-
1
)
*
l
,
out0
);
// can optimize to each 8 stride.
input_tmp
+=
4
;
input_tmp_end
+=
4
;
output_ptr
+=
4
;
in0
=
in1
;
in2
=
in3
;
in4
=
in5
;
in6
=
in7
;
}
// top right pad
float32x4_t
pad0
=
vdupq_n_f32
(
input_data
[
l
-
1
]);
float32x4_t
pad1
=
vdupq_n_f32
(
input_data
[
2
*
l
-
1
]);
tmp0
=
vextq_f32
(
in0
,
pad0
,
1
);
tmp1
=
vextq_f32
(
in0
,
pad0
,
2
);
tmp2
=
vextq_f32
(
in2
,
pad1
,
1
);
tmp3
=
vextq_f32
(
in2
,
pad1
,
2
);
out0
=
vmulq_n_f32
(
in0
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
2
);
}
}
// bottom right pad
float32x4_t
pad2
=
vdupq_n_f32
(
input_data
[
l
*
l
-
1
-
l
]);
float32x4_t
pad3
=
vdupq_n_f32
(
input_data
[
l
*
l
-
1
]);
tmp0
=
vextq_f32
(
in4
,
pad2
,
1
);
tmp1
=
vextq_f32
(
in4
,
pad2
,
2
);
tmp2
=
vextq_f32
(
in6
,
pad3
,
1
);
tmp3
=
vextq_f32
(
in6
,
pad3
,
2
);
out0
=
vmulq_n_f32
(
in4
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in6
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
2
);
}
}
// mid
for
(
int
i
=
0
;
i
<
l
-
2
;
++
i
)
{
auto
output_ptr
=
output_data
+
(
i
+
1
)
*
l
+
1
;
input_tmp
=
input_data
+
i
*
l
;
auto
in0_tmp
=
vld1q_f32
(
input_tmp
);
auto
in2_tmp
=
vld1q_f32
(
input_tmp
+
l
);
auto
in4_tmp
=
vld1q_f32
(
input_tmp
+
l
+
l
);
c_mid
=
l_mid
;
for
(;
c_mid
>
3
;
c_mid
-=
4
)
{
auto
in1_tmp
=
vld1q_f32
(
input_tmp
+
4
);
auto
in3_tmp
=
vld1q_f32
(
input_tmp
+
l
+
4
);
auto
in5_tmp
=
vld1q_f32
(
input_tmp
+
l
+
l
+
4
);
tmp0
=
vextq_f32
(
in0_tmp
,
in1_tmp
,
1
);
tmp1
=
vextq_f32
(
in0_tmp
,
in1_tmp
,
2
);
tmp2
=
vextq_f32
(
in2_tmp
,
in3_tmp
,
1
);
tmp3
=
vextq_f32
(
in2_tmp
,
in3_tmp
,
2
);
tmp4
=
vextq_f32
(
in4_tmp
,
in5_tmp
,
1
);
tmp5
=
vextq_f32
(
in4_tmp
,
in5_tmp
,
2
);
out0
=
vmulq_n_f32
(
in0_tmp
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2_tmp
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4_tmp
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
,
out0
);
output_ptr
+=
4
;
input_tmp
+=
4
;
in0_tmp
=
in1_tmp
;
in2_tmp
=
in3_tmp
;
in4_tmp
=
in5_tmp
;
}
float32x4_t
pad0
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
]);
float32x4_t
pad1
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
+
l
]);
float32x4_t
pad2
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
+
l
+
l
]);
tmp0
=
vextq_f32
(
in0_tmp
,
pad0
,
1
);
tmp1
=
vextq_f32
(
in0_tmp
,
pad0
,
2
);
tmp2
=
vextq_f32
(
in2_tmp
,
pad1
,
1
);
tmp3
=
vextq_f32
(
in2_tmp
,
pad1
,
2
);
tmp4
=
vextq_f32
(
in4_tmp
,
pad2
,
1
);
tmp5
=
vextq_f32
(
in4_tmp
,
pad2
,
2
);
out0
=
vmulq_n_f32
(
in0_tmp
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2_tmp
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4_tmp
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
2
);
}
}
}
output_data
+=
hxw
;
input_data
+=
hxw
;
filter_data_tmp
+=
9
;
}
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/math/depthwiseconv3x3s1p1.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
void
DepthwiseConv3x3s1p1
(
const
Tensor
*
input
,
Tensor
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/mul_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
mul
);
REGISTER_OPERATOR_MALI_GPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/pool_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
pool2d
);
REGISTER_OPERATOR_MALI_GPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/relu_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU
(
relu
,
ops
::
ReluOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
relu
);
REGISTER_OPERATOR_MALI_GPU
(
relu
,
ops
::
ReluOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/reshape_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
reshape
);
REGISTER_OPERATOR_MALI_GPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/softmax_op.cpp
浏览文件 @
dd0cf789
...
...
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
softmax
);
REGISTER_OPERATOR_MALI_GPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
tools/push2android.sh
→
tools/
android-debug-script/
push2android.sh
浏览文件 @
dd0cf789
#!/usr/bin/env sh
push_fn
()
{
MODELS_PATH
=
"../test/models/*"
MODELS_SRC
=
"../test/models"
IMAGE_PATH
=
"../test/images/*"
EXE_FILE
=
"../test/build/*"
MODELS_PATH
=
"../
../
test/models/*"
MODELS_SRC
=
"../
../
test/models"
IMAGE_PATH
=
"../
../
test/images/*"
EXE_FILE
=
"../
../
test/build/*"
EXE_DIR
=
"data/local/tmp/bin"
adb shell
mkdir
${
EXE_DIR
}
MODELS_DIR
=
"data/local/tmp/models"
...
...
@@ -14,9 +14,14 @@ do
adb shell
mkdir
${
MODELS_DIR
}
"/"
${
file
}
done
if
[[
-d
"../../src/operators/kernel/mali/ACL_Android/build"
]]
;
then
ACL_BUILD_PATH
=
"../../src/operators/kernel/mali/ACL_Android/build/*"
adb push
${
ACL_BUILD_PATH
}
${
EXE_DIR
}
fi
IMAGES_DIR
=
"data/local/tmp/images"
adb shell
mkdir
${
IMAGES_DIR
}
LIB_PATH
=
"../build/release/arm-v7a/build/*"
LIB_PATH
=
"../
../
build/release/arm-v7a/build/*"
adb push
${
EXE_FILE
}
${
EXE_DIR
}
adb push
${
LIB_PATH
}
${
EXE_DIR
}
if
[[
$1
!=
"npm"
]]
;
then
...
...
tools/
scripts
/run_on_android.sh
→
tools/
android-debug-script
/run_on_android.sh
浏览文件 @
dd0cf789
文件已移动
tools/build.sh
浏览文件 @
dd0cf789
...
...
@@ -56,7 +56,7 @@ build_for_android() {
MODE
=
"Release"
ANDROID_PLATFORM_VERSION
=
"android-
15
"
ANDROID_PLATFORM_VERSION
=
"android-
22
"
TOOLCHAIN_FILE
=
"./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE
=
"arm"
if
[
$#
-eq
1
]
;
then
...
...
tools/run.sh
已删除
100644 → 0
浏览文件 @
97102280
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录