Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
dd0cf789
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
dd0cf789
编写于
6月 28, 2018
作者:
R
Ruilong Liu
提交者:
GitHub
6月 28, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
97102280
ccd7c7de
变更
51
隐藏空白更改
内联
并排
Showing
51 changed file
with
4246 addition
and
533 deletion
+4246
-533
.gitmodules
.gitmodules
+3
-0
CMakeLists.txt
CMakeLists.txt
+13
-2
src/framework/operator.h
src/framework/operator.h
+12
-0
src/operators/batchnorm_op.cpp
src/operators/batchnorm_op.cpp
+2
-0
src/operators/concat_op.cpp
src/operators/concat_op.cpp
+2
-0
src/operators/elementwise_add_op.cpp
src/operators/elementwise_add_op.cpp
+2
-0
src/operators/feed_op.h
src/operators/feed_op.h
+2
-0
src/operators/fetch_op.h
src/operators/fetch_op.h
+2
-0
src/operators/fusion_conv_add.cpp
src/operators/fusion_conv_add.cpp
+2
-0
src/operators/fusion_conv_add.h
src/operators/fusion_conv_add.h
+12
-0
src/operators/fusion_conv_add_relu_op.h
src/operators/fusion_conv_add_relu_op.h
+5
-0
src/operators/fusion_fc_op.cpp
src/operators/fusion_fc_op.cpp
+2
-0
src/operators/fusion_fc_op.h
src/operators/fusion_fc_op.h
+9
-1
src/operators/kernel/arm/batchnorm_kernel.cpp
src/operators/kernel/arm/batchnorm_kernel.cpp
+2
-210
src/operators/kernel/arm/conv_add_relu_kernel.cpp
src/operators/kernel/arm/conv_add_relu_kernel.cpp
+2
-86
src/operators/kernel/arm/conv_kernel.cpp
src/operators/kernel/arm/conv_kernel.cpp
+2
-82
src/operators/kernel/arm/depthwise_conv_kernel.cpp
src/operators/kernel/arm/depthwise_conv_kernel.cpp
+2
-86
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
+234
-0
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+117
-0
src/operators/kernel/central-arm-func/conv_arm_func.h
src/operators/kernel/central-arm-func/conv_arm_func.h
+115
-0
src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
...erators/kernel/central-arm-func/depthwise_conv_arm_func.h
+116
-0
src/operators/kernel/conv_kernel.h
src/operators/kernel/conv_kernel.h
+0
-15
src/operators/kernel/mali/ACL_Android
src/operators/kernel/mali/ACL_Android
+1
-0
src/operators/kernel/mali/acl_operator.cc
src/operators/kernel/mali/acl_operator.cc
+220
-0
src/operators/kernel/mali/acl_operator.h
src/operators/kernel/mali/acl_operator.h
+1144
-0
src/operators/kernel/mali/acl_tensor.cc
src/operators/kernel/mali/acl_tensor.cc
+160
-0
src/operators/kernel/mali/acl_tensor.h
src/operators/kernel/mali/acl_tensor.h
+128
-0
src/operators/kernel/mali/batchnorm_kernel.cpp
src/operators/kernel/mali/batchnorm_kernel.cpp
+134
-2
src/operators/kernel/mali/concat_kernel.cpp
src/operators/kernel/mali/concat_kernel.cpp
+140
-0
src/operators/kernel/mali/conv_add_kernel.cpp
src/operators/kernel/mali/conv_add_kernel.cpp
+235
-0
src/operators/kernel/mali/conv_kernel.cpp
src/operators/kernel/mali/conv_kernel.cpp
+199
-5
src/operators/kernel/mali/elementwise_add_kernel.cpp
src/operators/kernel/mali/elementwise_add_kernel.cpp
+52
-0
src/operators/kernel/mali/fushion_fc_kernel.cpp
src/operators/kernel/mali/fushion_fc_kernel.cpp
+77
-0
src/operators/kernel/mali/lrn_kernel.cpp
src/operators/kernel/mali/lrn_kernel.cpp
+148
-0
src/operators/kernel/mali/mul_kernel.cpp
src/operators/kernel/mali/mul_kernel.cpp
+59
-0
src/operators/kernel/mali/pool_kernel.cpp
src/operators/kernel/mali/pool_kernel.cpp
+220
-0
src/operators/kernel/mali/relu_kernel.cpp
src/operators/kernel/mali/relu_kernel.cpp
+136
-0
src/operators/kernel/mali/reshape_kernel.cpp
src/operators/kernel/mali/reshape_kernel.cpp
+60
-0
src/operators/kernel/mali/softmax_kernel.cpp
src/operators/kernel/mali/softmax_kernel.cpp
+137
-0
src/operators/lrn_op.cpp
src/operators/lrn_op.cpp
+2
-0
src/operators/math/depthwiseconv3x3s1p1.cpp
src/operators/math/depthwiseconv3x3s1p1.cpp
+288
-0
src/operators/math/depthwiseconv3x3s1p1.h
src/operators/math/depthwiseconv3x3s1p1.h
+27
-0
src/operators/mul_op.cpp
src/operators/mul_op.cpp
+2
-0
src/operators/pool_op.cpp
src/operators/pool_op.cpp
+2
-0
src/operators/relu_op.cpp
src/operators/relu_op.cpp
+2
-0
src/operators/reshape_op.cpp
src/operators/reshape_op.cpp
+2
-0
src/operators/softmax_op.cpp
src/operators/softmax_op.cpp
+2
-0
tools/android-debug-script/push2android.sh
tools/android-debug-script/push2android.sh
+10
-5
tools/android-debug-script/run_on_android.sh
tools/android-debug-script/run_on_android.sh
+0
-0
tools/build.sh
tools/build.sh
+1
-1
tools/run.sh
tools/run.sh
+0
-38
未找到文件。
.gitmodules
0 → 100644
浏览文件 @
dd0cf789
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
CMakeLists.txt
浏览文件 @
dd0cf789
...
@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON)
...
@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON)
option
(
LOG_PROFILE
"log profile"
ON
)
option
(
LOG_PROFILE
"log profile"
ON
)
# select the platform to build
# select the platform to build
option
(
CPU
"armv7 with neon"
ON
)
option
(
CPU
"armv7 with neon"
ON
)
option
(
MALI_GPU
"mali gpu"
O
FF
)
option
(
MALI_GPU
"mali gpu"
O
N
)
option
(
FPGA
"fpga"
OFF
)
option
(
FPGA
"fpga"
OFF
)
set
(
DEBUGING ON
)
set
(
DEBUGING ON
)
if
(
CPU
)
if
(
CPU
)
...
@@ -15,7 +15,18 @@ if (CPU)
...
@@ -15,7 +15,18 @@ if (CPU)
endif
()
endif
()
if
(
MALI_GPU
)
if
(
MALI_GPU
)
add_definitions
(
-DPADDLE_MOBILE_MALI_GPU
)
add_definitions
(
-DPADDLE_MOBILE_MALI_GPU
)
add_definitions
(
-DUSE_ACL=1
)
add_definitions
(
-DUSE_OPENCL
)
set
(
ACL_ROOT
${
CMAKE_CURRENT_SOURCE_DIR
}
/src/operators/kernel/mali/ACL_Android
)
include_directories
(
${
ACL_ROOT
}
${
ACL_ROOT
}
/include
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_core"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_graph"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build/opencl-1.2-stubs"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-lOpenCL"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-DUSE_ACL=1"
)
endif
()
endif
()
if
(
FPGA
)
if
(
FPGA
)
...
...
src/framework/operator.h
浏览文件 @
dd0cf789
...
@@ -138,9 +138,21 @@ class OpKernelBase {
...
@@ -138,9 +138,21 @@ class OpKernelBase {
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* */
* */
#ifdef PADDLE_MOBILE_MALI_GPU
OpKernelBase
()
{
acl_op_
=
nullptr
;
}
void
*
GetAclOp
()
const
{
return
acl_op_
;
}
void
SetAclOp
(
void
*
op
,
void
*
ob
)
const
{
reinterpret_cast
<
OpKernelBase
<
Dtype
,
P
>
*>
(
ob
)
->
acl_op_
=
op
;
}
#endif
virtual
void
Compute
(
const
P
&
para
)
const
=
0
;
virtual
void
Compute
(
const
P
&
para
)
const
=
0
;
virtual
bool
Init
(
const
P
&
para
)
const
{
return
true
;
};
virtual
bool
Init
(
const
P
&
para
)
const
{
return
true
;
};
virtual
~
OpKernelBase
()
=
default
;
virtual
~
OpKernelBase
()
=
default
;
private:
#ifdef PADDLE_MOBILE_MALI_GPU
void
*
acl_op_
;
#endif
};
};
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
...
...
src/operators/batchnorm_op.cpp
浏览文件 @
dd0cf789
...
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
...
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU
(
batch_norm
,
ops
::
BatchNormOp
);
REGISTER_OPERATOR_CPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
batch_norm
);
REGISTER_OPERATOR_MALI_GPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/concat_op.cpp
浏览文件 @
dd0cf789
...
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
...
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU
(
concat
,
ops
::
ConcatOp
);
REGISTER_OPERATOR_CPU
(
concat
,
ops
::
ConcatOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
concat
);
REGISTER_OPERATOR_MALI_GPU
(
concat
,
ops
::
ConcatOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/elementwise_add_op.cpp
浏览文件 @
dd0cf789
...
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
...
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
REGISTER_OPERATOR_CPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
elementwise_add
);
REGISTER_OPERATOR_MALI_GPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/feed_op.h
浏览文件 @
dd0cf789
...
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
...
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU
(
feed
,
ops
::
FeedOp
);
REGISTER_OPERATOR_CPU
(
feed
,
ops
::
FeedOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
feed
);
REGISTER_OPERATOR_MALI_GPU
(
feed
,
ops
::
FeedOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/fetch_op.h
浏览文件 @
dd0cf789
...
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
...
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU
(
fetch
,
ops
::
FetchOp
);
REGISTER_OPERATOR_CPU
(
fetch
,
ops
::
FetchOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fetch
);
REGISTER_OPERATOR_MALI_GPU
(
fetch
,
ops
::
FetchOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/fusion_conv_add.cpp
浏览文件 @
dd0cf789
...
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
...
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU
(
conv_add
,
ops
::
FusionConvAddOp
);
REGISTER_OPERATOR_CPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
conv_add
);
REGISTER_OPERATOR_MALI_GPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/fusion_conv_add.h
浏览文件 @
dd0cf789
...
@@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
...
@@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
};
};
#ifdef PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static
framework
::
FusionOpRegistrar
convadd_registrar
(
static
framework
::
FusionOpRegistrar
convadd_registrar
(
new
FusionConvAddMatcher
());
new
FusionConvAddMatcher
());
#define CONV_ADD_REGISTER
#endif
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static
framework
::
FusionOpRegistrar
convadd_registrar
(
new
FusionConvAddMatcher
());
#define CONV_ADD_REGISTER
#endif
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/fusion_conv_add_relu_op.h
浏览文件 @
dd0cf789
...
@@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
...
@@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
};
};
#ifdef PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// FusionConvAddReluOpMatcher());
// FusionConvAddReluOpMatcher());
#endif
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#endif
...
...
src/operators/fusion_fc_op.cpp
浏览文件 @
dd0cf789
...
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
...
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU
(
fc
,
ops
::
FusionFcOp
);
REGISTER_OPERATOR_CPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fc
);
REGISTER_OPERATOR_MALI_GPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/fusion_fc_op.h
浏览文件 @
dd0cf789
...
@@ -66,11 +66,19 @@ class FusionFcOp
...
@@ -66,11 +66,19 @@ class FusionFcOp
};
};
#ifdef PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/kernel/arm/batchnorm_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -16,7 +16,7 @@ limitations under the License. */
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include "operators/kernel/batchnorm_kernel.h"
#include "operators/kernel/batchnorm_kernel.h"
#include
<cmath>
#include
"operators/kernel/central-arm-func/batchnorm_arm_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -28,215 +28,7 @@ bool BatchNormKernel<CPU, float>::Init(const BatchNormParam ¶) const {
...
@@ -28,215 +28,7 @@ bool BatchNormKernel<CPU, float>::Init(const BatchNormParam ¶) const {
template
<
>
template
<
>
void
BatchNormKernel
<
CPU
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{
void
BatchNormKernel
<
CPU
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
BatchnormCompute
<
float
>
(
param
);
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
const
auto
&
x_dims
=
input_x
->
dims
();
const
int
N
=
x_dims
[
0
];
const
int
C
=
x_dims
[
1
];
const
int
H
=
x_dims
[
2
];
const
int
W
=
x_dims
[
3
];
const
int
stride0
=
C
*
H
*
W
;
const
int
stride1
=
H
*
W
;
const
int
stride2
=
W
;
Tensor
*
out
=
param
.
OutputY
();
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
const
float
epsilon
=
param
.
Epsilon
();
const
Tensor
*
mean
=
param
.
InputMean
();
const
Tensor
*
variance
=
param
.
InputVariance
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE
(
C
==
variance
->
numel
(),
"C must equal to variance.numel()"
);
int
HXW
=
H
*
W
;
if
(
HXW
>
32
)
{
int
NXC
=
N
*
C
;
float
*
inv_std_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_scale_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_bias_ptr
=
new
float
[
NXC
*
4
];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for
(
int
i
=
0
;
i
<
C
*
4
;
i
+=
4
)
{
int
index
=
i
/
4
;
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
index
]
+
epsilon
),
0.5
));
inv_std_ptr
[
i
+
1
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
2
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
3
]
=
inv_std_ptr
[
i
];
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_scale_ptr
[
i
+
1
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
2
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
3
]
=
new_scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
index
]
-
mean_ptr
[
index
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_bias_ptr
[
i
+
1
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
2
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
3
]
=
new_bias_ptr
[
i
];
}
for
(
int
j
=
C
*
4
;
j
<
NXC
*
4
;
++
j
)
{
new_scale_ptr
[
j
]
=
new_scale_ptr
[
j
-
C
*
4
];
new_bias_ptr
[
j
]
=
new_bias_ptr
[
j
-
C
*
4
];
}
asm
volatile
(
"subs %[N], %[N], #1
\n\t
"
"blt end_n_%=
\n\t
"
"loop_n_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"blt end_c_%=
\n\t
"
"loop_c_%=:
\n\t
"
"vld1.32 {q9}, [%[new_scale_ptr]]!
\n\t
"
"vld1.32 {q10}, [%[new_bias_ptr]]!
\n\t
"
"mov r6, %[HXW]
\n\t
"
"subs r6, r6, #32
\n\t
"
"blt end_hw_%=
\n\t
"
"loop_hw_%=:
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"subs r6, r6, #32
\n\t
"
"bge loop_hw_%=
\n\t
"
"end_hw_%=:
\n\t
"
"cmp r6, #0
\n\t
"
"bge end_remainder_%=
\n\t
"
"mov r5, #4
\n\t
"
"mul r6, r6, r5
\n\t
"
"add %[input_x_ptr], %[input_x_ptr], r6
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"add %[out_ptr], %[out_ptr], r6
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"end_remainder_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"bge loop_c_%=
\n\t
"
"end_c_%=:
\n\t
"
"subs %[N], %[N], #1
\n\t
"
"bge loop_n_%=
\n\t
"
"end_n_%=:
\n\t
"
:
:
[
input_x_ptr
]
"r"
(
input_x_ptr
),
[
out_ptr
]
"r"
(
out_ptr
),
[
new_scale_ptr
]
"r"
(
new_scale_ptr
),
[
new_bias_ptr
]
"r"
(
new_bias_ptr
),
[
N
]
"r"
(
N
),
[
C
]
"r"
(
C
),
[
HXW
]
"r"
(
HXW
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"r5"
,
"r6"
);
delete
[]
inv_std_ptr
;
delete
[]
new_scale_ptr
;
delete
[]
new_bias_ptr
;
}
else
{
float
*
inv_std_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
Tensor
new_scale
;
auto
new_scale_ptr
=
new_scale
.
mutable_data
<
float
>
(
make_ddim
({
C
}));
Tensor
new_bias
;
auto
new_bias_ptr
=
new_bias
.
mutable_data
<
float
>
(
make_ddim
({
C
}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
h
=
0
;
h
<
H
;
h
++
)
{
int
tmp_index
=
n
*
stride0
+
i
*
stride1
+
h
*
stride2
;
for
(
int
w
=
0
;
w
<
W
;
w
++
)
{
int
index
=
tmp_index
+
w
;
out_ptr
[
index
]
=
input_x_ptr
[
index
]
*
new_scale_ptr
[
i
]
+
new_bias_ptr
[
i
];
}
}
}
}
}
delete
[]
inv_std_ptr
;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
}
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/arm/conv_add_relu_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -28,92 +29,7 @@ bool ConvAddReluKernel<CPU, float>::Init(
...
@@ -28,92 +29,7 @@ bool ConvAddReluKernel<CPU, float>::Init(
template
<
>
template
<
>
void
ConvAddReluKernel
<
CPU
,
float
>::
Compute
(
void
ConvAddReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddReluParam
&
param
)
const
{
const
FusionConvAddReluParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
ConvAddReluCompute
<
float
>
(
param
);
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
math
::
expand_bias
(
bias
,
axis
,
output
->
dims
());
output
->
ShareDataWith
(
bias
);
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
);
}
}
}
}
template
class
ConvAddReluKernel
<
CPU
,
float
>;
template
class
ConvAddReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/conv_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef CONV_OP
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -26,88 +27,7 @@ bool ConvKernel<CPU, float>::Init(const ConvParam ¶) const {
...
@@ -26,88 +27,7 @@ bool ConvKernel<CPU, float>::Init(const ConvParam ¶) const {
template
<
>
template
<
>
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
ConvCompute
<
float
>
(
param
);
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
template
class
ConvKernel
<
CPU
,
float
>;
template
class
ConvKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/depthwise_conv_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef DEPTHWISECONV_OP
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/c
onv_kernel
.h"
#include "operators/kernel/c
entral-arm-func/depthwise_conv_arm_func
.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam ¶) const {
...
@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam ¶) const {
template
<
>
template
<
>
void
DepthwiseConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
void
DepthwiseConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
LOG
(
kLOG_DEBUG
)
<<
param
;
DepthwiseConvCompute
<
float
>
(
param
);
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
// DLOG << " compute end get Attrs " << strides[0];
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
template
class
DepthwiseConvKernel
<
CPU
,
float
>;
template
class
DepthwiseConvKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/central-arm-func/batchnorm_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
BatchnormCompute
(
const
BatchNormParam
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
auto
input_x_ptr
=
input_x
->
data
<
float
>
();
const
auto
&
x_dims
=
input_x
->
dims
();
const
int
N
=
x_dims
[
0
];
const
int
C
=
x_dims
[
1
];
const
int
H
=
x_dims
[
2
];
const
int
W
=
x_dims
[
3
];
const
int
stride0
=
C
*
H
*
W
;
const
int
stride1
=
H
*
W
;
const
int
stride2
=
W
;
Tensor
*
out
=
param
.
OutputY
();
auto
out_ptr
=
out
->
mutable_data
<
float
>
();
const
float
epsilon
=
param
.
Epsilon
();
const
Tensor
*
mean
=
param
.
InputMean
();
const
Tensor
*
variance
=
param
.
InputVariance
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE
(
C
==
variance
->
numel
(),
"C must equal to variance.numel()"
);
int
HXW
=
H
*
W
;
if
(
HXW
>
32
)
{
int
NXC
=
N
*
C
;
float
*
inv_std_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_scale_ptr
=
new
float
[
NXC
*
4
];
float
*
volatile
new_bias_ptr
=
new
float
[
NXC
*
4
];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for
(
int
i
=
0
;
i
<
C
*
4
;
i
+=
4
)
{
int
index
=
i
/
4
;
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
index
]
+
epsilon
),
0.5
));
inv_std_ptr
[
i
+
1
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
2
]
=
inv_std_ptr
[
i
];
inv_std_ptr
[
i
+
3
]
=
inv_std_ptr
[
i
];
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_scale_ptr
[
i
+
1
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
2
]
=
new_scale_ptr
[
i
];
new_scale_ptr
[
i
+
3
]
=
new_scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
index
]
-
mean_ptr
[
index
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
index
];
new_bias_ptr
[
i
+
1
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
2
]
=
new_bias_ptr
[
i
];
new_bias_ptr
[
i
+
3
]
=
new_bias_ptr
[
i
];
}
for
(
int
j
=
C
*
4
;
j
<
NXC
*
4
;
++
j
)
{
new_scale_ptr
[
j
]
=
new_scale_ptr
[
j
-
C
*
4
];
new_bias_ptr
[
j
]
=
new_bias_ptr
[
j
-
C
*
4
];
}
asm
volatile
(
"subs %[N], %[N], #1
\n\t
"
"blt end_n_%=
\n\t
"
"loop_n_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"blt end_c_%=
\n\t
"
"loop_c_%=:
\n\t
"
"vld1.32 {q9}, [%[new_scale_ptr]]!
\n\t
"
"vld1.32 {q10}, [%[new_bias_ptr]]!
\n\t
"
"mov r6, %[HXW]
\n\t
"
"subs r6, r6, #32
\n\t
"
"blt end_hw_%=
\n\t
"
"loop_hw_%=:
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"subs r6, r6, #32
\n\t
"
"bge loop_hw_%=
\n\t
"
"end_hw_%=:
\n\t
"
"cmp r6, #0
\n\t
"
"bge end_remainder_%=
\n\t
"
"mov r5, #4
\n\t
"
"mul r6, r6, r5
\n\t
"
"add %[input_x_ptr], %[input_x_ptr], r6
\n\t
"
"vld1.32 {q1, q2}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q3, q4}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q5, q6}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q7, q8}, [%[input_x_ptr]]!
\n\t
"
"vmul.f32 q1, q1, q9
\n\t
"
"vmul.f32 q2, q2, q9
\n\t
"
"vmul.f32 q3, q3, q9
\n\t
"
"vmul.f32 q4, q4, q9
\n\t
"
"vmul.f32 q5, q5, q9
\n\t
"
"vmul.f32 q6, q6, q9
\n\t
"
"vmul.f32 q7, q7, q9
\n\t
"
"vmul.f32 q8, q8, q9
\n\t
"
"vadd.f32 q1, q1, q10
\n\t
"
"vadd.f32 q2, q2, q10
\n\t
"
"vadd.f32 q3, q3, q10
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vadd.f32 q5, q5, q10
\n\t
"
"vadd.f32 q6, q6, q10
\n\t
"
"vadd.f32 q7, q7, q10
\n\t
"
"vadd.f32 q8, q8, q10
\n\t
"
"add %[out_ptr], %[out_ptr], r6
\n\t
"
"vst1.32 {q1, q2}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q3, q4}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q5, q6}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q7, q8}, [%[out_ptr]]!
\n\t
"
"end_remainder_%=:
\n\t
"
"subs %[C], %[C], #1
\n\t
"
"bge loop_c_%=
\n\t
"
"end_c_%=:
\n\t
"
"subs %[N], %[N], #1
\n\t
"
"bge loop_n_%=
\n\t
"
"end_n_%=:
\n\t
"
:
:
[
input_x_ptr
]
"r"
(
input_x_ptr
),
[
out_ptr
]
"r"
(
out_ptr
),
[
new_scale_ptr
]
"r"
(
new_scale_ptr
),
[
new_bias_ptr
]
"r"
(
new_bias_ptr
),
[
N
]
"r"
(
N
),
[
C
]
"r"
(
C
),
[
HXW
]
"r"
(
HXW
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"r5"
,
"r6"
);
delete
[]
inv_std_ptr
;
delete
[]
new_scale_ptr
;
delete
[]
new_bias_ptr
;
}
else
{
float
*
inv_std_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
Tensor
new_scale
;
auto
new_scale_ptr
=
new_scale
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
C
}));
Tensor
new_bias
;
auto
new_bias_ptr
=
new_bias
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
C
}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
h
=
0
;
h
<
H
;
h
++
)
{
int
tmp_index
=
n
*
stride0
+
i
*
stride1
+
h
*
stride2
;
for
(
int
w
=
0
;
w
<
W
;
w
++
)
{
int
index
=
tmp_index
+
w
;
out_ptr
[
index
]
=
input_x_ptr
[
index
]
*
new_scale_ptr
[
i
]
+
new_bias_ptr
[
i
];
}
}
}
}
}
delete
[]
inv_std_ptr
;
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvAddReluCompute
(
const
FusionConvAddReluParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
math
::
expand_bias
(
bias
,
axis
,
output
->
dims
());
output
->
ShareDataWith
(
bias
);
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
true
);
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvCompute
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
DepthwiseConvCompute
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
// DLOG << " compute end get Attrs " << strides[0];
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
));
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_kernel.h
浏览文件 @
dd0cf789
...
@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
...
@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
bool
Init
(
const
ConvParam
&
para
)
const
;
bool
Init
(
const
ConvParam
&
para
)
const
;
};
};
inline
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
dilations
)
{
bool
filter_1
=
true
,
strides_1
=
true
,
padding_0
=
true
,
dilation_1
=
true
;
for
(
size_t
j
=
0
;
j
<
strides
.
size
();
++
j
)
{
filter_1
=
filter_1
&&
(
static_cast
<
int
>
(
filter_dim
[
j
+
2
])
==
1
);
strides_1
=
strides_1
&&
(
strides
[
j
]
==
1
);
padding_0
=
padding_0
&&
(
paddings
[
j
]
==
0
);
dilation_1
=
dilation_1
&&
(
dilations
[
j
]
==
1
);
}
return
!
(
filter_1
&&
strides_1
&&
padding_0
&&
dilation_1
);
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
ACL_Android
@
591027fc
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
src/operators/kernel/mali/acl_operator.cc
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
unsigned
int
bypass_acl_class_layer
=
(
0
|
FLAGS_ENABLE_ACL_CONCAT
|
/*0xffffffff |*/
/*FLAGS_ENABLE_ACL_FC |*/
/*FLAGS_ENABLE_ACL_LRN
|*/
0
);
int
enable_schedule
=
0
;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned
int
acl_log_flags
=
(
0
|
MASK_LOG_APP_TIME
|
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */
/*MASK_LOG_CONFIG | */
/*MASK_LOG_COPY | */
MASK_LOG_ABSVAL
|
MASK_LOG_BNLL
|
MASK_LOG_CONV
|
MASK_LOG_FC
|
MASK_LOG_LRN
|
MASK_LOG_POOLING
|
MASK_LOG_RELU
|
MASK_LOG_SIGMOID
|
MASK_LOG_SOFTMAX
|
MASK_LOG_TANH
|
MASK_LOG_LC
|
MASK_LOG_BN
|
MASK_LOG_CONCAT
|
0
);
#include <stdio.h>
/* printf */
#include <stdlib.h>
/* getenv */
#endif // USE_PROFILING
static
bool
force_enable_gpu
=
false
;
bool
AclEnableSchedule
(
int
enable
)
{
enable_schedule
=
enable
;
if
(
enable
)
{
force_enable_gpu
=
true
;
}
return
true
;
}
int
isScheduleEnable
()
{
return
enable_schedule
;
}
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
bool
ACLOperator
::
init_gpu_env
=
true
;
#ifdef USE_OPENCL
bool
ACLOperator
::
support_opencl_
=
false
;
bool
opencl_is_available
()
{
return
arm_compute
::
opencl_is_available
();
}
#elif defined(USE_OPENGLES)
bool
ACLOperator
::
support_opengles_
=
false
;
#endif
ACLOperator
::
ACLOperator
(
bool
is_gpu
)
:
operator_state_
(
operator_not_init
),
force_bypass_acl_path_
(
false
),
target_hint_
(
TargetHint
::
DONT_CARE
),
convolution_method_hint_
(
ConvolutionMethodHint
::
GEMM
),
_group
(
1
),
name_
(
""
),
input_idx_
(
0
),
output_idx_
(
0
),
is_gpu_
(
is_gpu
)
{
const
char
*
pBypassACL
;
if
(
init_gpu_env
)
{
#ifdef USE_OPENCL
try
{
if
(
opencl_is_available
())
{
arm_compute
::
CLScheduler
::
get
().
default_init
();
support_opencl_
=
true
;
}
}
catch
(
std
::
exception
&
e
)
{
support_opencl_
=
false
;
}
#elif defined(USE_OPENGLES)
try
{
arm_compute
::
GCScheduler
::
get
().
default_init
();
support_opengles_
=
true
;
}
catch
(
std
::
exception
&
e
)
{
support_opengles_
=
false
;
}
#endif
init_gpu_env
=
false
;
}
if
(
force_enable_gpu
)
is_gpu_
=
true
;
pBypassACL
=
getenv
(
"BYPASSACL"
);
if
(
pBypassACL
)
{
unsigned
int
bacl
;
sscanf
(
pBypassACL
,
"%i"
,
&
bacl
);
if
(
bacl
!=
bypass_acl_class_layer
)
{
bypass_acl_class_layer
=
bacl
;
printf
(
"BYPASSACL<%s>
\n
"
,
pBypassACL
);
printf
(
"BYPASSACL: %x
\n
"
,
bypass_acl_class_layer
);
}
}
#ifdef USE_PROFILING
const
char
*
pLogACL
;
pLogACL
=
getenv
(
"LOGACL"
);
if
(
pLogACL
)
{
unsigned
int
alf
;
sscanf
(
pLogACL
,
"%i"
,
&
alf
);
if
(
alf
!=
acl_log_flags
)
{
acl_log_flags
=
alf
;
printf
(
"LOGACL<%s>
\n
"
,
pLogACL
);
printf
(
"LOGACL: %x
\n
"
,
acl_log_flags
);
}
}
#endif // USE_PROFILING
const
char
*
pEnableSchedule
;
pEnableSchedule
=
getenv
(
"ENABLESCHEDULE"
);
if
(
pEnableSchedule
)
{
int
bshedule
;
sscanf
(
pEnableSchedule
,
"%i"
,
&
bshedule
);
if
(
bshedule
!=
enable_schedule
)
{
enable_schedule
=
bshedule
;
printf
(
"ENABLESCHEDULE<%s>
\n
"
,
pEnableSchedule
);
printf
(
"ENABLESCHEDULE: %x
\n
"
,
enable_schedule
);
}
if
(
enable_schedule
)
{
AclEnableSchedule
(
1
);
}
}
}
ACLOperator
::~
ACLOperator
()
{}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLTensor
>&
tensor
,
arm_compute
::
TensorShape
&
shape
,
void
*
mem
,
bool
commit
)
{
auto
acl_tensor
=
new
ACLTensor
(
arm_compute
::
TensorInfo
(
shape
,
arm_compute
::
Format
::
F32
));
acl_tensor
->
set_target
(
getTargetHint
());
acl_tensor
->
bindmem
(
mem
);
if
(
commit
)
acl_tensor
->
commit
();
tensor
=
(
std
::
unique_ptr
<
ACLTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLSubTensor
>&
tensor
,
std
::
unique_ptr
<
ACLTensor
>&
parent
,
arm_compute
::
TensorShape
&
shape
,
arm_compute
::
Coordinates
&
coord
)
{
auto
acl_tensor
=
new
ACLSubTensor
(
parent
,
shape
,
coord
);
acl_tensor
->
set_target
(
getTargetHint
());
tensor
=
(
std
::
unique_ptr
<
ACLSubTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
void
ACLTensor
::
commit
(
TensorType
type
)
{
settensortype
(
type
);
if
(
mem_
)
{
if
(
!
allocate_
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_ALLOCATE_INFO
);
#endif // USE_PROFILING
allocate
();
allocate_
=
true
;
}
if
(
type_
!=
tensor_output
)
{
tensor_copy
(
mem_
);
}
mem_
=
nullptr
;
}
}
int
BaseACLTensor
::
tensor_copy
(
arm_compute
::
ITensor
*
tensor
,
void
*
mem
,
bool
toTensor
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_COPY_INFO
);
#endif // USE_PROFILING
arm_compute
::
Window
window
;
// Iterate through the rows (not each element)
window
.
use_tensor_dimensions
(
tensor
->
info
()
->
tensor_shape
(),
/* first_dimension =*/
arm_compute
::
Window
::
DimY
);
int
width
=
tensor
->
info
()
->
tensor_shape
()[
0
];
int
height
=
tensor
->
info
()
->
tensor_shape
()[
1
];
int
deepth
=
tensor
->
info
()
->
tensor_shape
()[
2
];
map
();
// Create an iterator:
arm_compute
::
Iterator
it
(
tensor
,
window
);
// Except it works for an arbitrary number of dimensions
if
(
toTensor
)
{
// mem->tensor
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(
it
.
ptr
(),
((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
+
id
.
x
())
*
tensor
->
info
()
->
element_size
()),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
else
{
// tensor-->mem
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
)
*
tensor
->
info
()
->
element_size
()),
it
.
ptr
(),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
unmap
();
return
0
;
}
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/acl_operator.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_OPERATOR_H_
#define ACL_OPERATOR_H_
#include <framework/tensor.h>
#include <operators/op_param.h>
#if USE_ACL == 1
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#endif
#ifdef USE_OPENGLES
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
#endif
#include "acl_tensor.h"
#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
#define FLAGS_ENABLE_ACL_BNLL 0x00000002
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_FC 0x00000008
#define FLAGS_ENABLE_ACL_LRN 0x00000010
#define FLAGS_ENABLE_ACL_POOLING 0x00000020
#define FLAGS_ENABLE_ACL_RELU 0x00000040
#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
#define FLAGS_ENABLE_ACL_TANH 0x00000200
#define FLAGS_ENABLE_ACL_LC 0x00000400
#define FLAGS_ENABLE_ACL_BN 0x00000800
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
extern
unsigned
int
bypass_acl_class_layer
;
#ifdef USE_PROFILING
#include <sys/time.h>
#define NANO_SEC_CONV 1000000
#define MASK_LOG_APP_TIME 0x00000001
#define MASK_LOG_ALLOCATE 0x00000002
#define MASK_LOG_RUN 0x00000004
#define MASK_LOG_CONFIG 0x00000008
#define MASK_LOG_COPY 0x00000010
#define MASK_LOG_ABSVAL 0x00000020
#define MASK_LOG_BNLL 0x00000040
#define MASK_LOG_CONV 0x00000080
#define MASK_LOG_FC 0x00000100
#define MASK_LOG_LRN 0x00000200
#define MASK_LOG_POOLING 0x00000400
#define MASK_LOG_RELU 0x00000800
#define MASK_LOG_SIGMOID 0x00001000
#define MASK_LOG_SOFTMAX 0x00002000
#define MASK_LOG_TANH 0x00004000
#define MASK_LOG_LC 0x00008000
#define MASK_LOG_BN 0x00010000
#define MASK_LOG_CONCAT 0x00020000
#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t"
#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t"
#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SIGMOID_INFO \
MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SOFTMAX_INFO \
MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_TANH_INFO \
MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_BN_INFO \
MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_CONCAT_INFO \
MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
extern
unsigned
int
acl_log_flags
;
class
logtime_util
{
public:
logtime_util
()
{
mask
=
0
;
}
logtime_util
(
int
mask_
,
const
char
*
information_
)
{
setlogtime_info
(
mask_
,
information_
);
}
void
setlogtime_info
(
int
mask_
,
const
char
*
information_
)
{
mask
=
mask_
;
if
(
acl_log_flags
&
mask
)
{
strncpy
(
information
,
information_
,
255
);
gettimeofday
(
&
tv
[
0
],
NULL
);
}
}
~
logtime_util
()
{
if
(
acl_log_flags
&
mask
)
{
int
time
[
2
];
gettimeofday
(
&
tv
[
1
],
NULL
);
time
[
0
]
=
tv
[
0
].
tv_sec
*
NANO_SEC_CONV
+
tv
[
0
].
tv_usec
;
time
[
1
]
=
tv
[
1
].
tv_sec
*
NANO_SEC_CONV
+
tv
[
1
].
tv_usec
;
printf
(
"%s %.6lf
\n
"
,
information
,
(((
double
)
time
[
1
]
-
time
[
0
])
/
NANO_SEC_CONV
));
}
}
void
log_time
(
bool
start
)
{
if
(
acl_log_flags
&
mask
)
{
if
(
start
)
{
gettimeofday
(
&
tv
[
0
],
NULL
);
}
else
{
int
time
[
2
];
gettimeofday
(
&
tv
[
1
],
NULL
);
time
[
0
]
=
tv
[
0
].
tv_sec
*
NANO_SEC_CONV
+
tv
[
0
].
tv_usec
;
time
[
1
]
=
tv
[
1
].
tv_sec
*
NANO_SEC_CONV
+
tv
[
1
].
tv_usec
;
printf
(
"%s %.6lf
\n
"
,
information
,
(((
double
)
time
[
1
]
-
time
[
0
])
/
NANO_SEC_CONV
));
}
}
}
private:
struct
timeval
tv
[
2
];
int
mask
;
char
information
[
256
];
};
#endif // USE_PROFILING
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
class
AclParameters
{
public:
AclParameters
()
{
dilated
=
false
;
dim
=
2
;
num_group
=
1
;
}
int
batch
;
int
in_depth
;
int
in_rows
;
int
in_cols
;
int
out_depth
;
int
out_rows
;
int
out_cols
;
int
out_num
;
int
filter_rows
;
int
filter_cols
;
int
stride_rows
;
int
stride_cols
;
int
pad_rows
;
int
pad_cols
;
int
dilation_rows
;
int
dilation_cols
;
int
num_group
;
bool
dilated
;
int
dim
;
int
epsilon
;
int
nsize
;
float
alpha
;
float
beta
;
float
knorm
;
void
*
input_data
;
void
*
output_data
;
void
*
weight_data
;
void
*
biases_data
;
void
*
mean_data
;
void
*
var_data
;
std
::
string
pool_type
;
std
::
string
act_type
;
std
::
string
data_layout
;
bool
is_global_pool
;
bool
is_channel_concat
;
std
::
vector
<
framework
::
LoDTensor
*>
in_tensor
;
};
enum
TensorType
{
tensor_input
,
tensor_output
,
tensor_weights
,
tensor_biases
,
tensor_mean
,
tensor_var
,
tensor_beta
,
tensor_gamma
,
tensor_concat
,
tensor_data
,
};
enum
OperatorState
{
operator_not_init
,
operator_init_done
,
operator_reinit
,
};
enum
OperateType
{
operate_type_pooling
,
operate_type_activation
,
operate_type_lrn
,
operate_type_conv
,
operate_type_lc
,
operate_type_fc
,
operate_type_bn
,
operate_type_softmax
,
operate_type_concat
,
};
class
BaseACLTensor
{
public:
BaseACLTensor
()
:
type_
(
tensor_input
),
allocate_
(
false
)
{}
virtual
~
BaseACLTensor
()
{}
virtual
void
bindmem
(
void
*
mem
)
{
mem_
=
mem
;
}
virtual
void
settensortype
(
TensorType
type
)
{
type_
=
type
;
}
virtual
void
map
(
bool
blocking
=
true
)
{}
virtual
void
unmap
()
{}
virtual
void
commit
(
TensorType
type
=
tensor_data
)
{}
int
tensor_copy
(
arm_compute
::
ITensor
*
tensor
,
void
*
mem
,
bool
toTensor
=
true
);
protected:
void
*
mem_
;
TensorType
type_
;
bool
allocate_
;
};
class
ACLTensor
:
public
BaseACLTensor
,
public
Tensor
{
public:
explicit
ACLTensor
(
arm_compute
::
TensorInfo
&&
info
)
:
Tensor
(
info
)
{}
virtual
void
map
(
bool
blocking
=
true
)
{
if
(
!
allocate_
)
{
Tensor
::
allocate
();
allocate_
=
true
;
}
Tensor
::
map
(
blocking
);
}
virtual
int
tensor_copy
(
void
*
mem
,
bool
toTensor
=
true
)
{
auto
acl_tensor
=
this
;
arm_compute
::
ITensor
*
tensor
=
acl_tensor
->
tensor
();
BaseACLTensor
::
tensor_copy
(
tensor
,
mem
,
toTensor
);
return
0
;
}
virtual
void
unmap
()
{
Tensor
::
unmap
();
}
virtual
void
commit
(
TensorType
type
=
tensor_data
);
};
class
ACLSubTensor
:
public
BaseACLTensor
,
public
SubTensor
{
public:
ACLSubTensor
(
std
::
unique_ptr
<
ACLTensor
>
&
parent
,
arm_compute
::
TensorShape
&
shape
,
arm_compute
::
Coordinates
&
coord
)
:
SubTensor
(
parent
.
get
(),
shape
,
coord
)
{}
virtual
int
tensor_copy
(
void
*
mem
,
bool
toTensor
=
true
)
{
return
0
;
}
};
template
<
typename
T
>
class
TensorPair
{
public:
TensorPair
()
{}
~
TensorPair
()
{}
TensorType
type
;
std
::
unique_ptr
<
T
>
tensor
;
};
template
<
typename
T
>
std
::
unique_ptr
<
T
>
&
tensor_item
(
std
::
vector
<
std
::
unique_ptr
<
TensorPair
<
T
>>>
&
pool
,
TensorType
type
,
int
idx
)
{
int
count
=
0
;
for
(
auto
&
item
:
pool
)
{
if
(
item
.
get
()
->
type
==
type
)
{
++
count
;
}
if
(
item
.
get
()
->
type
==
type
&&
idx
==
count
-
1
)
{
return
item
.
get
()
->
tensor
;
}
}
pool
.
push_back
((
std
::
unique_ptr
<
TensorPair
<
T
>>
)
std
::
move
(
new
TensorPair
<
T
>
));
auto
item
=
pool
[
pool
.
size
()
-
1
].
get
();
item
->
type
=
type
;
item
->
tensor
=
NULL
;
return
item
->
tensor
;
}
class
ACLOperator
{
public:
virtual
void
commit
()
{
for
(
auto
&
item
:
tensor_pool_
)
{
if
(
item
.
get
()
->
tensor
)
item
.
get
()
->
tensor
->
commit
(
item
.
get
()
->
type
);
}
}
inline
void
run
()
{
commit
();
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_RUN_INFO
);
#endif // USE_PROFILING
for
(
auto
&
c
:
funcs_
)
{
c
->
run
();
}
}
inline
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
funcs
()
{
return
funcs_
;
}
inline
std
::
unique_ptr
<
ACLSubTensor
>
&
sinput
(
int
idx
=
0
)
{
return
tensor_item
(
subtensor_pool_
,
tensor_input
,
idx
);
}
inline
std
::
unique_ptr
<
ACLSubTensor
>
&
soutput
(
int
idx
=
0
)
{
return
tensor_item
(
subtensor_pool_
,
tensor_output
,
idx
);
}
inline
std
::
unique_ptr
<
ACLSubTensor
>
&
sweights
(
int
idx
=
0
)
{
return
tensor_item
(
subtensor_pool_
,
tensor_weights
,
idx
);
}
inline
std
::
unique_ptr
<
ACLSubTensor
>
&
sbiases
(
int
idx
=
0
)
{
return
tensor_item
(
subtensor_pool_
,
tensor_biases
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
cinput
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_concat
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
input
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_input
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
output
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_output
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
weights
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_weights
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
biases
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_biases
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
mean
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_mean
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
var
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_var
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
beta
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_beta
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
gamma
(
int
idx
=
0
)
{
return
tensor_item
(
tensor_pool_
,
tensor_gamma
,
idx
);
}
inline
std
::
unique_ptr
<
ACLTensor
>
&
tensor
(
TensorType
type
)
{
switch
(
type
)
{
case
tensor_biases
:
return
biases
();
break
;
case
tensor_weights
:
return
weights
();
break
;
case
tensor_output
:
return
output
();
break
;
default:
case
tensor_input
:
return
input
();
break
;
}
return
input
();
}
explicit
ACLOperator
(
bool
is_gpu
=
false
);
virtual
~
ACLOperator
();
inline
TargetHint
getTargetHint
()
{
#ifdef USE_OPENCL
if
(
target_hint_
==
TargetHint
::
DONT_CARE
)
{
if
(
is_gpu_
)
{
return
TargetHint
::
OPENCL
;
}
return
TargetHint
::
NEON
;
}
return
target_hint_
;
#elif defined(USE_OPENGLES)
if
(
target_hint_
==
TargetHint
::
DONT_CARE
)
{
if
(
is_gpu_
)
{
return
TargetHint
::
OPENGLES
;
}
return
TargetHint
::
NEON
;
}
return
target_hint_
;
#else
return
TargetHint
::
NEON
;
#endif
}
inline
void
setTargetHint
(
TargetHint
hint
)
{
target_hint_
=
hint
;
}
inline
ConvolutionMethodHint
&
getConvMethod
()
{
return
convolution_method_hint_
;
}
inline
void
setConvMethod
()
{
convolution_method_hint_
=
ConvolutionMethodHint
::
DIRECT
;
}
inline
bool
tensor_mem
(
std
::
unique_ptr
<
ACLTensor
>
&
tensor
,
void
*
mem
)
{
tensor
->
bindmem
(
mem
);
return
true
;
}
inline
bool
tensor_mem
(
void
*
mem
,
std
::
unique_ptr
<
ACLTensor
>
&
tensor
)
{
tensor
->
tensor_copy
(
mem
,
false
);
return
true
;
}
bool
new_tensor
(
std
::
unique_ptr
<
ACLTensor
>
&
tensor
,
arm_compute
::
TensorShape
&
shape
,
void
*
mem
=
nullptr
,
bool
commit
=
false
);
bool
new_tensor
(
std
::
unique_ptr
<
ACLSubTensor
>
&
tensor
,
std
::
unique_ptr
<
ACLTensor
>
&
parent
,
arm_compute
::
TensorShape
&
shape
,
arm_compute
::
Coordinates
&
coord
);
inline
int
&
group
()
{
return
_group
;
}
inline
void
set_operator_property
(
OperateType
type
,
const
char
*
name
)
{
name_
=
name
;
type_
=
type
;
}
inline
void
acl_run
(
void
*
input_data
,
void
*
output_data
)
{
if
(
input_data
)
tensor_mem
(
input
(),
input_data
);
run
();
tensor_mem
(
output_data
,
output
());
}
inline
int
&
input_idx
()
{
return
input_idx_
;
}
inline
int
&
output_idx
()
{
return
output_idx_
;
}
protected:
inline
bool
isGPUMode
()
{
#ifdef USE_OPENCL
if
(
!
support_opencl_
)
return
false
;
return
getTargetHint
()
==
TargetHint
::
OPENCL
;
#elif defined(USE_OPENGLES)
if
(
!
support_opengles_
)
return
false
;
return
getTargetHint
()
==
TargetHint
::
OPENGLES
;
#endif
return
false
;
}
inline
OperatorState
&
opstate
()
{
return
operator_state_
;
}
inline
bool
is_operator_init_done
(
arm_compute
::
TensorShape
shape
,
TensorType
type
=
tensor_input
)
{
checkreshape
(
shape
,
type
);
return
operator_state_
==
operator_init_done
;
}
inline
void
set_operator_init_done
()
{
opstate
()
=
operator_init_done
;
set_bypass_state
(
false
);
}
inline
void
set_bypass_state
(
bool
state
=
false
)
{
force_bypass_acl_path_
=
state
;
}
inline
OperatorState
checkreshape
(
arm_compute
::
TensorShape
shape
,
TensorType
type
=
tensor_input
)
{
opstate
()
=
reshape
(
shape
,
type
);
if
(
opstate
()
==
operator_reinit
)
{
freeres
();
}
return
opstate
();
}
inline
OperatorState
reshape
(
arm_compute
::
TensorShape
&
shape
,
TensorType
type
)
{
arm_compute
::
TensorShape
_shape
;
std
::
unique_ptr
<
ACLTensor
>
&
acl_tensor
=
tensor
(
type
);
if
(
!
acl_tensor
.
get
())
return
operator_not_init
;
_shape
=
acl_tensor
->
info
().
tensor_shape
();
if
(
_shape
.
total_size
()
==
shape
.
total_size
()
&&
_shape
[
0
]
==
shape
[
0
]
&&
_shape
[
1
]
==
shape
[
1
])
{
return
operator_init_done
;
}
return
operator_reinit
;
}
inline
void
freeres
()
{
tensor_pool_
.
clear
();
subtensor_pool_
.
clear
();
funcs_
.
clear
();
}
inline
const
char
*&
name
()
{
return
name_
;
}
inline
void
set_in_out_index
(
int
indata_idx
,
int
outdata_idx
)
{
input_idx
()
=
indata_idx
;
output_idx
()
=
outdata_idx
;
}
protected:
std
::
vector
<
std
::
unique_ptr
<
TensorPair
<
ACLTensor
>>>
tensor_pool_
;
std
::
vector
<
std
::
unique_ptr
<
TensorPair
<
ACLSubTensor
>>>
subtensor_pool_
;
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
funcs_
;
OperatorState
operator_state_
;
bool
force_bypass_acl_path_
;
TargetHint
target_hint_
;
ConvolutionMethodHint
convolution_method_hint_
;
static
bool
support_opengles_
;
static
bool
support_opencl_
;
static
bool
init_gpu_env
;
int
_group
;
const
char
*
name_
;
OperateType
type_
;
int
input_idx_
,
output_idx_
;
bool
is_gpu_
;
};
int
isScheduleEnable
();
template
<
typename
OperatorType
,
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_function
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
)
{
auto
op
=
cpp14
::
make_unique
<
OperatorType
>
();
op
->
configure
(
dynamic_cast
<
TensorType
*>
(
input
),
dynamic_cast
<
TensorType
*>
(
output
));
return
std
::
move
(
op
);
}
template
<
typename
OperatorType
,
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
)
{
return
instantiate_function
<
OperatorType
,
TensorType
>
(
input
,
output
);
}
template
<
typename
OpType
,
typename
OpTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_op_func
(
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
&
hint
)
{
std
::
unique_ptr
<
arm_compute
::
IFunction
>
func
;
func
=
instantiate
<
OpType
,
OpTensor
>
(
input
->
tensor
(),
output
->
tensor
());
return
func
;
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
VectorTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_function
(
VectorTensor
inputs
,
arm_compute
::
ITensor
*
output
)
{
auto
op
=
cpp14
::
make_unique
<
OperatorType
>
();
op
->
configure
(
inputs
,
dynamic_cast
<
TensorType
*>
(
output
));
return
std
::
move
(
op
);
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
VectorTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate
(
VectorTensor
inputs
,
arm_compute
::
ITensor
*
output
)
{
return
instantiate_function
<
OperatorType
,
TensorType
,
VectorTensor
>
(
inputs
,
output
);
}
template
<
typename
OpType
,
typename
OpTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_op_func_lists
(
ACLOperator
*&
acl_op
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
int
num
,
TargetHint
&
hint
)
{
std
::
unique_ptr
<
arm_compute
::
IFunction
>
func
;
static
std
::
vector
<
OpTensor
*>
tensors
;
tensors
.
clear
();
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
tensors
.
push_back
(
dynamic_cast
<
OpTensor
*>
(
acl_op
->
cinput
(
i
).
get
()
->
tensor
()));
}
func
=
instantiate
<
OpType
,
OpTensor
,
std
::
vector
<
OpTensor
*>>
(
tensors
,
output
->
tensor
());
return
func
;
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
OperatorInfo
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_function
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
,
const
OperatorInfo
&
info
)
{
auto
op
=
cpp14
::
make_unique
<
OperatorType
>
();
op
->
configure
(
dynamic_cast
<
TensorType
*>
(
input
),
dynamic_cast
<
TensorType
*>
(
output
),
info
);
return
std
::
move
(
op
);
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
OperatorInfo
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
,
const
OperatorInfo
&
info
)
{
return
instantiate_function
<
OperatorType
,
TensorType
,
OperatorInfo
>
(
input
,
output
,
info
);
}
template
<
typename
OpType
,
typename
OpTensor
,
typename
OperatorInfo
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_op_func
(
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
const
OperatorInfo
&
info
,
TargetHint
&
hint
)
{
std
::
unique_ptr
<
arm_compute
::
IFunction
>
func
;
func
=
instantiate
<
OpType
,
OpTensor
,
OperatorInfo
>
(
input
->
tensor
(),
output
->
tensor
(),
info
);
return
func
;
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
OperatorInfo
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_function
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
weights
,
arm_compute
::
ITensor
*
biases
,
arm_compute
::
ITensor
*
output
,
const
OperatorInfo
&
info
)
{
auto
op
=
cpp14
::
make_unique
<
OperatorType
>
();
op
->
configure
(
dynamic_cast
<
TensorType
*>
(
input
),
dynamic_cast
<
TensorType
*>
(
weights
),
dynamic_cast
<
TensorType
*>
(
biases
),
dynamic_cast
<
TensorType
*>
(
output
),
info
);
return
std
::
move
(
op
);
}
template
<
typename
OperatorType
,
typename
TensorType
,
typename
OperatorInfo
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
weights
,
arm_compute
::
ITensor
*
biases
,
arm_compute
::
ITensor
*
output
,
const
OperatorInfo
&
info
)
{
return
instantiate_function
<
OperatorType
,
TensorType
,
OperatorInfo
>
(
input
,
weights
,
biases
,
output
,
info
);
}
template
<
typename
OpType
,
typename
OpTensor
,
typename
OperatorInfo
,
typename
ACLTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_op_func
(
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
weights
,
std
::
unique_ptr
<
ACLTensor
>
&
biases
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
const
OperatorInfo
&
info
,
TargetHint
&
hint
)
{
std
::
unique_ptr
<
arm_compute
::
IFunction
>
func
;
arm_compute
::
ITensor
*
biases_tensor
=
NULL
;
if
(
biases
.
get
())
{
biases_tensor
=
biases
->
tensor
();
}
func
=
instantiate
<
OpType
,
OpTensor
,
OperatorInfo
>
(
input
->
tensor
(),
weights
->
tensor
(),
biases_tensor
,
output
->
tensor
(),
info
);
return
func
;
}
template
<
typename
Dtype
,
typename
OperatorType
,
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_function
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
,
arm_compute
::
ITensor
*
mean
,
arm_compute
::
ITensor
*
var
,
arm_compute
::
ITensor
*
beta
,
arm_compute
::
ITensor
*
gamma
,
Dtype
&
eps
)
{
auto
op
=
cpp14
::
make_unique
<
OperatorType
>
();
op
->
configure
(
dynamic_cast
<
TensorType
*>
(
input
),
dynamic_cast
<
TensorType
*>
(
output
),
dynamic_cast
<
TensorType
*>
(
mean
),
dynamic_cast
<
TensorType
*>
(
var
),
dynamic_cast
<
TensorType
*>
(
beta
),
dynamic_cast
<
TensorType
*>
(
gamma
),
eps
);
return
std
::
move
(
op
);
}
template
<
typename
Dtype
,
typename
OperatorType
,
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate
(
arm_compute
::
ITensor
*
input
,
arm_compute
::
ITensor
*
output
,
arm_compute
::
ITensor
*
mean
,
arm_compute
::
ITensor
*
var
,
arm_compute
::
ITensor
*
beta
,
arm_compute
::
ITensor
*
gamma
,
Dtype
eps
)
{
return
instantiate_function
<
Dtype
,
OperatorType
,
TensorType
>
(
input
,
output
,
mean
,
var
,
beta
,
gamma
,
eps
);
}
template
<
typename
Dtype
,
typename
OpType
,
typename
OpTensor
>
std
::
unique_ptr
<
arm_compute
::
IFunction
>
instantiate_op_func
(
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
std
::
unique_ptr
<
ACLTensor
>
&
mean
,
std
::
unique_ptr
<
ACLTensor
>
&
var
,
std
::
unique_ptr
<
ACLTensor
>
&
beta
,
std
::
unique_ptr
<
ACLTensor
>
&
gamma
,
Dtype
eps
,
TargetHint
hint
)
{
std
::
unique_ptr
<
arm_compute
::
IFunction
>
func
;
func
=
instantiate
<
Dtype
,
OpType
,
OpTensor
>
(
input
->
tensor
(),
output
->
tensor
(),
mean
->
tensor
(),
var
->
tensor
(),
beta
->
tensor
(),
gamma
->
tensor
(),
eps
);
return
func
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_pooling
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLPoolingLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PoolingLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCPoolingLayer
,
arm_compute
::
IGCTensor
,
arm_compute
::
PoolingLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEPoolingLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PoolingLayerInfo
>
(
input
,
output
,
info
,
hint
));
}
return
true
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_activation
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLActivationLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
ActivationLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCActivationLayer
,
arm_compute
::
IGCTensor
,
arm_compute
::
ActivationLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEActivationLayer
,
arm_compute
::
ITensor
,
arm_compute
::
ActivationLayerInfo
>
(
input
,
output
,
info
,
hint
));
}
return
true
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_lrn
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLNormalizationLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
NormalizationLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCNormalizationLayer
,
arm_compute
::
IGCTensor
,
arm_compute
::
NormalizationLayerInfo
>
(
input
,
output
,
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NENormalizationLayer
,
arm_compute
::
ITensor
,
arm_compute
::
NormalizationLayerInfo
>
(
input
,
output
,
info
,
hint
));
}
return
true
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_conv
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
std
::
unique_ptr
<
ACLTensor
>
&
weights
=
acl_op
->
weights
();
std
::
unique_ptr
<
ACLTensor
>
&
biases
=
acl_op
->
biases
();
ConvolutionMethodHint
&
conv_method
=
acl_op
->
getConvMethod
();
bool
has_biases
=
biases
.
get
()
?
true
:
false
;
int
&
groups
=
acl_op
->
group
();
arm_compute
::
TensorShape
input_shape
=
input
->
info
().
tensor_shape
();
arm_compute
::
TensorShape
weights_shape
=
weights
->
info
().
tensor_shape
();
arm_compute
::
TensorShape
biases_shape
;
if
(
has_biases
)
{
biases_shape
=
biases
->
info
().
tensor_shape
();
}
arm_compute
::
TensorShape
output_shape
=
output
->
info
().
tensor_shape
();
if
(
groups
==
1
)
{
if
(
conv_method
==
ConvolutionMethodHint
::
GEMM
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLConvolutionLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCConvolutionLayer
,
arm_compute
::
IGCTensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEConvolutionLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
}
}
else
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLDirectConvolutionLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCDirectConvolutionLayer
,
arm_compute
::
IGCTensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEDirectConvolutionLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PadStrideInfo
>
(
acl_op
->
input
(),
acl_op
->
weights
(),
acl_op
->
biases
(),
acl_op
->
output
(),
info
,
hint
));
}
}
return
true
;
}
// Calculate sub-tensor splits
const
int
input_split
=
input_shape
.
z
()
/
groups
;
const
int
output_split
=
output_shape
.
z
()
/
groups
;
const
int
weights_split
=
weights_shape
[
3
]
/
groups
;
const
int
biases_split
=
biases_shape
.
x
()
/
groups
;
// Calculate sub-tensor shapes
input_shape
.
set
(
2
,
input_split
);
output_shape
.
set
(
2
,
output_split
);
weights_shape
.
set
(
3
,
weights_split
);
biases_shape
.
set
(
0
,
biases_split
);
for
(
auto
i
=
0
;
i
<
groups
;
++
i
)
{
// Calculate sub-tensors starting coordinates
arm_compute
::
Coordinates
input_coord
(
0
,
0
,
input_split
*
i
);
arm_compute
::
Coordinates
output_coord
(
0
,
0
,
output_split
*
i
);
arm_compute
::
Coordinates
weights_coord
(
0
,
0
,
0
,
weights_split
*
i
);
arm_compute
::
Coordinates
biases_coord
(
biases_split
*
i
);
// Create sub-tensors for input, output, weights and bias
acl_op
->
new_tensor
(
acl_op
->
sinput
(
i
),
acl_op
->
input
(),
input_shape
,
input_coord
);
acl_op
->
new_tensor
(
acl_op
->
soutput
(
i
),
acl_op
->
output
(),
output_shape
,
output_coord
);
acl_op
->
new_tensor
(
acl_op
->
sweights
(
i
),
acl_op
->
weights
(),
weights_shape
,
weights_coord
);
if
(
has_biases
)
{
acl_op
->
new_tensor
(
acl_op
->
sbiases
(
i
),
acl_op
->
biases
(),
biases_shape
,
biases_coord
);
}
bool
use_opencl
=
false
;
if
(
conv_method
==
ConvolutionMethodHint
::
GEMM
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
use_opencl
=
true
;
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLConvolutionLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PadStrideInfo
,
ACLSubTensor
>
(
acl_op
->
sinput
(
i
),
acl_op
->
sweights
(
i
),
acl_op
->
sbiases
(
i
),
acl_op
->
soutput
(
i
),
info
,
hint
));
}
#endif
if
(
!
use_opencl
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEConvolutionLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PadStrideInfo
,
ACLSubTensor
>
(
acl_op
->
sinput
(
i
),
acl_op
->
sweights
(
i
),
acl_op
->
sbiases
(
i
),
acl_op
->
soutput
(
i
),
info
,
hint
));
}
}
else
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
use_opencl
=
true
;
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLDirectConvolutionLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PadStrideInfo
,
ACLSubTensor
>
(
acl_op
->
sinput
(
i
),
acl_op
->
sweights
(
i
),
acl_op
->
sbiases
(
i
),
acl_op
->
soutput
(
i
),
info
,
hint
));
}
#endif
if
(
!
use_opencl
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEDirectConvolutionLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PadStrideInfo
,
ACLSubTensor
>
(
acl_op
->
sinput
(
i
),
acl_op
->
sweights
(
i
),
acl_op
->
sbiases
(
i
),
acl_op
->
soutput
(
i
),
info
,
hint
));
}
}
}
return
true
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_lc
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
std
::
unique_ptr
<
ACLTensor
>
&
weights
=
acl_op
->
weights
();
std
::
unique_ptr
<
ACLTensor
>
&
biases
=
acl_op
->
biases
();
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLLocallyConnectedLayer
,
arm_compute
::
ICLTensor
,
arm_compute
::
PadStrideInfo
>
(
input
,
weights
,
biases
,
output
,
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NELocallyConnectedLayer
,
arm_compute
::
ITensor
,
arm_compute
::
PadStrideInfo
>
(
input
,
weights
,
biases
,
output
,
info
,
hint
));
}
return
true
;
}
template
<
typename
OperatorInfo
>
bool
instantiate_op_fc
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
const
OperatorInfo
&
info
)
{
std
::
unique_ptr
<
ACLTensor
>
&
weights
=
acl_op
->
weights
();
std
::
unique_ptr
<
ACLTensor
>
&
biases
=
acl_op
->
biases
();
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLFullyConnectedLayer
,
arm_compute
::
ICLTensor
,
bool
>
(
input
,
weights
,
biases
,
output
,
info
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCFullyConnectedLayer
,
arm_compute
::
IGCTensor
,
bool
>
(
input
,
weights
,
biases
,
output
,
info
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NEFullyConnectedLayer
,
arm_compute
::
ITensor
,
bool
>
(
input
,
weights
,
biases
,
output
,
info
,
hint
));
}
return
true
;
}
template
<
typename
Dtype
>
bool
instantiate_op_bn
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
Dtype
eps
)
{
std
::
unique_ptr
<
ACLTensor
>
&
mean
=
acl_op
->
mean
();
std
::
unique_ptr
<
ACLTensor
>
&
var
=
acl_op
->
var
();
std
::
unique_ptr
<
ACLTensor
>
&
beta
=
acl_op
->
beta
();
std
::
unique_ptr
<
ACLTensor
>
&
gamma
=
acl_op
->
gamma
();
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
Dtype
,
arm_compute
::
CLBatchNormalizationLayer
,
arm_compute
::
ICLTensor
>
(
input
,
output
,
mean
,
var
,
beta
,
gamma
,
eps
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
Dtype
,
arm_compute
::
GCBatchNormalizationLayer
,
arm_compute
::
IGCTensor
>
(
input
,
output
,
mean
,
var
,
beta
,
gamma
,
eps
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
Dtype
,
arm_compute
::
NEBatchNormalizationLayer
,
arm_compute
::
ITensor
>
(
input
,
output
,
mean
,
var
,
beta
,
gamma
,
eps
,
hint
));
}
return
true
;
}
inline
bool
instantiate_op_softmax
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
void
*
data
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
CLSoftmaxLayer
,
arm_compute
::
ICLTensor
>
(
input
,
output
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
GCSoftmaxLayer
,
arm_compute
::
IGCTensor
>
(
input
,
output
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func
<
arm_compute
::
NESoftmaxLayer
,
arm_compute
::
ITensor
>
(
input
,
output
,
hint
));
}
return
true
;
}
inline
bool
instantiate_op_concat
(
ACLOperator
*
acl_op
,
std
::
vector
<
std
::
unique_ptr
<
arm_compute
::
IFunction
>>
&
func
,
std
::
unique_ptr
<
ACLTensor
>
&
input
,
std
::
unique_ptr
<
ACLTensor
>
&
output
,
TargetHint
hint
,
int
num
)
{
#ifdef USE_OPENCL
if
(
hint
==
TargetHint
::
OPENCL
)
{
func
.
push_back
(
instantiate_op_func_lists
<
arm_compute
::
CLDepthConcatenateLayer
,
arm_compute
::
ICLTensor
>
(
acl_op
,
output
,
num
,
hint
));
return
true
;
}
#elif defined(USE_OPENGLES)
if
(
hint
==
TargetHint
::
OPENGLES
)
{
func
.
push_back
(
instantiate_op_func_lists
<
arm_compute
::
GCDepthConcatenateLayer
,
arm_compute
::
IGCTensor
>
(
acl_op
,
output
,
num
,
hint
));
return
true
;
}
#endif
{
func
.
push_back
(
instantiate_op_func_lists
<
arm_compute
::
NEDepthConcatenateLayer
,
arm_compute
::
ITensor
>
(
acl_op
,
output
,
num
,
hint
));
}
return
true
;
}
template
<
typename
Dtype
>
void
*
InputdataPtr
(
ACLOperator
*
op
,
const
std
::
vector
<
framework
::
LoDTensor
*>
&
input_data
,
Dtype
type
,
int
index
=
-
1
)
{
if
(
index
==
-
1
)
index
=
0
;
return
(
void
*
)(
input_data
[
index
]
->
mutable_data
<
Dtype
>
());
}
template
<
typename
Dtype
>
void
acl_run
(
ACLOperator
*
op
,
const
std
::
vector
<
framework
::
LoDTensor
*>
&
in_data
,
void
*
out_data
,
Dtype
type
,
bool
multi_input_run
=
true
)
{
for
(
int
i
=
0
;
i
<
in_data
.
size
();
++
i
)
{
op
->
tensor_mem
(
op
->
cinput
(
i
),
InputdataPtr
(
op
,
in_data
,
type
,
i
));
}
op
->
acl_run
(
NULL
,
out_data
);
}
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#ifdef USE_PROFILING
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
logtime_util log_time(ACL_CONFIG_INFO); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#else
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#endif
#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
#endif // USE_ACL
#endif // ACL_OPERATOR_H_
src/operators/kernel/mali/acl_tensor.cc
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
#ifdef USE_ACL
template
<
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_tensor
(
arm_compute
::
TensorInfo
&
info
)
{
auto
tensor
=
cpp14
::
make_unique
<
TensorType
>
();
tensor
->
allocator
()
->
init
(
info
);
return
std
::
move
(
tensor
);
}
template
<
typename
TensorType
>
void
tensor_allocate
(
arm_compute
::
ITensor
&
tensor
)
{
auto
itensor
=
dynamic_cast
<
TensorType
*>
(
&
tensor
);
itensor
->
allocator
()
->
allocate
();
}
Tensor
::
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_info
(
info
),
_tensor
(
nullptr
)
{}
Tensor
::
Tensor
(
Tensor
&&
src
)
noexcept
:
_target
(
src
.
_target
),
_info
(
std
::
move
(
src
.
_info
)),
_tensor
(
std
::
move
(
src
.
_tensor
))
{}
arm_compute
::
ITensor
*
Tensor
::
set_target
(
TargetHint
target
)
{
switch
(
target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_tensor
=
initialise_tensor
<
arm_compute
::
CLTensor
>
(
_info
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
_tensor
=
initialise_tensor
<
arm_compute
::
GCTensor
>
(
_info
);
break
;
#endif
case
TargetHint
::
NEON
:
_tensor
=
initialise_tensor
<
arm_compute
::
Tensor
>
(
_info
);
break
;
default:
break
;
}
_target
=
target
;
return
_tensor
.
get
();
}
void
Tensor
::
allocate
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
tensor_allocate
<
arm_compute
::
CLTensor
>
(
*
_tensor
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
tensor_allocate
<
arm_compute
::
GCTensor
>
(
*
_tensor
);
break
;
#endif
case
TargetHint
::
NEON
:
tensor_allocate
<
arm_compute
::
Tensor
>
(
*
_tensor
);
break
;
default:
break
;
}
}
void
Tensor
::
map
(
bool
blocking
)
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
map
(
blocking
);
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
map
(
blocking
);
#endif
}
void
Tensor
::
unmap
()
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
unmap
();
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
unmap
();
#endif
}
template
<
typename
SubTensorType
,
typename
ParentTensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_subtensor
(
arm_compute
::
ITensor
*
parent
,
arm_compute
::
TensorShape
shape
,
arm_compute
::
Coordinates
coords
)
{
auto
ptensor
=
dynamic_cast
<
ParentTensorType
*>
(
parent
);
auto
subtensor
=
cpp14
::
make_unique
<
SubTensorType
>
(
ptensor
,
shape
,
coords
);
return
std
::
move
(
subtensor
);
}
SubTensor
::
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_tensor_shape
(
tensor_shape
),
_coords
(
coords
),
_parent
(
nullptr
),
_subtensor
(
nullptr
)
{
_parent
=
parent
->
tensor
();
_target
=
parent
->
target
();
instantiate_subtensor
();
}
arm_compute
::
ITensor
*
SubTensor
::
set_target
(
TargetHint
target
)
{
return
(
target
==
_target
)
?
_subtensor
.
get
()
:
nullptr
;
}
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
{
return
_subtensor
.
get
();
}
const
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
const
{
return
_subtensor
.
get
();
}
TargetHint
SubTensor
::
target
()
const
{
return
_target
;
}
void
SubTensor
::
allocate
()
{
// NOP for sub-tensors
}
void
SubTensor
::
instantiate_subtensor
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
CLSubTensor
,
arm_compute
::
ICLTensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
#endif
default:
case
TargetHint
::
NEON
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
SubTensor
,
arm_compute
::
ITensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
}
}
#endif
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
src/operators/kernel/mali/acl_tensor.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
enum
class
TargetHint
{
DONT_CARE
,
OPENCL
,
OPENGLES
,
NEON
,
};
enum
class
ConvolutionMethodHint
{
GEMM
,
DIRECT
,
};
namespace
cpp14
{
template
<
class
T
>
struct
_Unique_if
{
typedef
std
::
unique_ptr
<
T
>
_Single_object
;
};
template
<
class
T
>
struct
_Unique_if
<
T
[]
>
{
typedef
std
::
unique_ptr
<
T
[]
>
_Unknown_bound
;
};
template
<
class
T
,
size_t
N
>
struct
_Unique_if
<
T
[
N
]
>
{
typedef
void
_Known_bound
;
};
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Single_object
make_unique
(
Args
&&
...
args
)
{
return
std
::
unique_ptr
<
T
>
(
new
T
(
std
::
forward
<
Args
>
(
args
)...));
}
template
<
class
T
>
typename
_Unique_if
<
T
>::
_Unknown_bound
make_unique
(
size_t
n
)
{
typedef
typename
std
::
remove_extent
<
T
>::
type
U
;
return
std
::
unique_ptr
<
T
>
(
new
U
[
n
]());
}
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Known_bound
make_unique
(
Args
&&
...);
}
// namespace cpp14
class
Tensor
{
public:
explicit
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
;
virtual
~
Tensor
()
{}
Tensor
(
Tensor
&&
src
)
noexcept
;
void
set_info
(
arm_compute
::
TensorInfo
&&
info
)
{
_info
=
info
;
}
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
const
arm_compute
::
TensorInfo
&
info
()
const
{
return
_info
;
}
arm_compute
::
ITensor
*
tensor
()
{
return
_tensor
.
get
();
}
void
allocate
();
void
init
()
{}
TargetHint
target
()
const
{
return
_target
;
}
virtual
void
map
(
bool
blocking
=
true
);
virtual
void
unmap
();
private:
TargetHint
_target
;
arm_compute
::
TensorInfo
_info
;
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_tensor
;
};
class
SubTensor
{
public:
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
;
~
SubTensor
()
{}
arm_compute
::
ITensor
*
tensor
();
const
arm_compute
::
ITensor
*
tensor
()
const
;
TargetHint
target
()
const
;
void
allocate
();
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
private:
/** Instantiates a sub-tensor */
void
instantiate_subtensor
();
private:
/**< Target that this tensor is pinned on */
TargetHint
_target
;
/**< SubTensor shape */
arm_compute
::
TensorShape
_tensor_shape
;
/**< SubTensor Coordinates */
arm_compute
::
Coordinates
_coords
;
/**< Parent tensor */
arm_compute
::
ITensor
*
_parent
;
/**< SubTensor */
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_subtensor
;
};
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
src/operators/kernel/mali/batchnorm_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -16,20 +16,152 @@ limitations under the License. */
...
@@ -16,20 +16,152 @@ limitations under the License. */
#include "operators/kernel/batchnorm_kernel.h"
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclBatchNormOp
:
public
acl
::
ACLOperator
{
public:
AclBatchNormOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_BN
;
}
~
AclBatchNormOp
()
=
default
;
AclBatchNormOp
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
(
AclBatchNormOp
&&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
AclBatchNormOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
BatchNormParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
TensorShape
mean_shape
(
args
.
in_depth
);
arm_compute
::
TensorShape
var_shape
=
mean_shape
;
arm_compute
::
TensorShape
beta_shape
=
mean_shape
;
arm_compute
::
TensorShape
gamma_shape
=
mean_shape
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
new_tensor
(
mean
(),
mean_shape
,
args
.
mean_data
);
new_tensor
(
var
(),
var_shape
,
args
.
var_data
);
new_tensor
(
beta
(),
beta_shape
,
args
.
biases_data
);
new_tensor
(
gamma
(),
gamma_shape
,
args
.
weight_data
);
acl_configure
(
bn
,
this
,
args
.
epsilon
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
BatchNormParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
BatchNormParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
OutputY
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
const
Tensor
*
saved_mean
=
param
.
InputMean
();
const
Tensor
*
saved_variance
=
param
.
InputVariance
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
scale
->
data
<
T
>
();
const
T
*
bias_data
=
bias
->
data
<
T
>
();
const
T
*
mean_data
=
saved_mean
->
data
<
T
>
();
const
T
*
var_data
=
saved_variance
->
data
<
T
>
();
float
epsilon
=
param
.
Epsilon
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args
.
mean_data
=
(
void
*
)
mean_data
;
args
.
var_data
=
(
void
*
)
var_data
;
args
.
epsilon
=
epsilon
;
args
.
dim
=
in_x
->
dims
().
size
();
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
args
.
out_depth
=
out
->
dims
()[
1
];
args
.
out_rows
=
out
->
dims
()[
2
];
args
.
out_cols
=
out
->
dims
()[
3
];
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
(
void
*
)
bias_data
;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
template
<
>
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
para
)
const
{
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
param
)
const
{
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclBatchNormOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
BatchNormKernel
<
GPU_MALI
,
float
>::
Compute
(
void
BatchNormKernel
<
GPU_MALI
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{}
const
BatchNormParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
BatchNormKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
#endif
src/operators/kernel/mali/concat_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConcatOp
:
public
acl
::
ACLOperator
{
public:
AclConcatOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONCAT
;
}
~
AclConcatOp
()
=
default
;
AclConcatOp
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
&
operator
=
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
(
AclConcatOp
&&
)
=
delete
;
AclConcatOp
&
operator
=
(
AclConcatOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConcatParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
const
std
::
vector
<
framework
::
LoDTensor
*>*
input_data
=
&
args
.
in_tensor
;
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
output_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
T
type
;
for
(
int
i
=
0
;
i
<
input_data
->
size
();
i
++
)
{
const
T
*
idata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
const
T
*
pdata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
int
in_batch
=
(
*
input_data
)[
i
]
->
dims
()[
0
];
int
in_channels
=
(
*
input_data
)[
i
]
->
dims
()[
1
];
int
in_width
=
(
*
input_data
)[
i
]
->
dims
()[
2
];
int
in_height
=
(
*
input_data
)[
i
]
->
dims
()[
3
];
arm_compute
::
TensorShape
in_shape
(
in_width
,
in_height
,
in_channels
);
new_tensor
(
cinput
(
i
),
in_shape
,
acl
::
InputdataPtr
(
this
,
args
.
in_tensor
,
type
,
i
));
}
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
concat
,
this
,
input_data
->
size
());
}
void
RunAcl
(
const
std
::
vector
<
framework
::
LoDTensor
*>&
input
,
void
*
output
)
{
T
type
;
acl
::
acl_run
(
this
,
input
,
output
,
type
);
}
bool
Bypass_acl
(
const
ConcatParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
!
args
.
is_channel_concat
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ConcatParam
&
param
)
{
auto
inputs
=
param
.
Inputs
();
auto
*
output
=
param
.
Out
();
int64_t
axis
=
param
.
Axis
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
args
.
is_channel_concat
=
(
axis
==
1
);
args
.
in_tensor
=
inputs
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConcatKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConcatParam
&
param
)
const
{
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConcatOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConcatKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConcatParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
std
::
vector
<
framework
::
LoDTensor
*>
temp_data
=
args
.
in_tensor
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
(
temp_data
,
(
void
*
)
output_data
);
}
template
class
ConcatKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_add_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvAddOp
:
public
acl
::
ACLOperator
{
public:
AclConvAddOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvAddOp
()
=
default
;
AclConvAddOp
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
&
operator
=
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
(
AclConvAddOp
&&
)
=
delete
;
AclConvAddOp
&
operator
=
(
AclConvAddOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
FusionConvAddParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
arm_compute
::
TensorShape
biases_shape
(
args
.
out_depth
);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
if
(
args
.
biases_data
)
{
new_tensor
(
biases
(),
biases_shape
,
args
.
biases_data
);
}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
FusionConvAddParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
FusionConvAddParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
Tensor
*
bias
;
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
try
{
bias
=
param
.
Bias
();
}
catch
(
const
std
::
exception
&
e
)
{
}
if
(
bias
)
{
const
T
*
biases_data
=
bias
->
data
<
T
>
();
args
.
biases_data
=
(
void
*
)
biases_data
;
}
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConvAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionConvAddParam
&
param
)
const
{
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvAddOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConvAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionConvAddParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ConvAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_kernel.cpp
浏览文件 @
dd0cf789
...
@@ -15,20 +15,213 @@ limitations under the License. */
...
@@ -15,20 +15,213 @@ limitations under the License. */
#ifdef CONV_OP
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvOp
:
public
acl
::
ACLOperator
{
public:
AclConvOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvOp
()
=
default
;
AclConvOp
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
&
operator
=
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
(
AclConvOp
&&
)
=
delete
;
AclConvOp
&
operator
=
(
AclConvOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConvParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ConvParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
std
::
cout
<<
"In N: "
<<
args
.
batch
<<
" C: "
<<
args
.
in_depth
<<
" H: "
<<
args
.
in_rows
<<
" W: "
<<
args
.
in_cols
<<
"
\n
"
;
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
template
<
>
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
param
)
const
{
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
// ArmConvImplement imp;
std
::
cout
<<
"init acl"
<<
std
::
endl
;
// imp.Compute(param);
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
param
.
Output
()
->
mutable_data
<
float
>
()[
0
]
=
100.0
;
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
}
template
class
ConvKernel
<
GPU_MALI
,
float
>;
template
class
ConvKernel
<
GPU_MALI
,
float
>;
...
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
...
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
#endif
src/operators/kernel/mali/elementwise_add_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
T
>
struct
AddFunctor
{
inline
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
template
<
>
bool
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
ElementwiseAddParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ElementwiseAddParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
Out
=
param
.
Out
();
Out
->
mutable_data
<
float
>
();
int
axis
=
param
.
Axis
();
ElementwiseComputeEx
<
AddFunctor
<
float
>
,
float
>
(
input_x
,
input_y
,
axis
,
AddFunctor
<
float
>
(),
Out
);
}
template
class
ElementwiseAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/fushion_fc_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
FusionFcKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionFcParam
&
para
)
const
{
return
true
;
}
template
<
>
void
FusionFcKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionFcParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_z
=
param
.
InputZ
();
auto
*
input_z_data
=
input_z
->
data
<
float
>
();
int
axis
=
param
.
Axis
();
Tensor
*
out
=
param
.
Out
();
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
input_z
->
dims
().
size
()
==
1
,
"inpu_z size must be 1"
);
PADDLE_MOBILE_ENFORCE
(
out_dim
[
1
]
==
input_z
->
dims
()[
0
],
" out_dim.size must be 2."
);
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
)
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
DLOG
<<
out_data
[
i
];
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
));
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/lrn_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclLrnOp
:
public
acl
::
ACLOperator
{
public:
AclLrnOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_LRN
;
}
~
AclLrnOp
()
=
default
;
AclLrnOp
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
&
operator
=
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
(
AclLrnOp
&&
)
=
delete
;
AclLrnOp
&
operator
=
(
AclLrnOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
LrnParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
NormalizationLayerInfo
norm_info
(
arm_compute
::
NormType
::
CROSS_MAP
,
args
.
nsize
,
args
.
alpha
,
args
.
beta
,
args
.
knorm
);
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
lrn
,
this
,
norm_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
LrnParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
LrnParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
Out
();
int
n
=
param
.
N
();
T
alpha
=
param
.
Alpha
();
T
beta
=
param
.
Beta
();
T
k
=
param
.
K
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
nsize
=
n
;
args
.
alpha
=
alpha
;
args
.
beta
=
beta
;
args
.
knorm
=
k
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
LrnKernel
<
GPU_MALI
,
float
>::
Init
(
const
LrnParam
&
param
)
const
{
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclLrnOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
LrnKernel
<
GPU_MALI
,
float
>::
Compute
(
const
LrnParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
}
}
template
class
LrnKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/mul_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
MulKernel
<
GPU_MALI
,
float
>::
Init
(
const
MulParam
&
para
)
const
{
return
true
;
}
template
<
>
void
MulKernel
<
GPU_MALI
,
float
>::
Compute
(
const
MulParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
out
=
param
.
Out
();
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
(
out_dim
);
}
}
template
class
MulKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/pool_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclPoolOp
:
public
acl
::
ACLOperator
{
public:
AclPoolOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_POOLING
;
}
~
AclPoolOp
()
=
default
;
AclPoolOp
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
&
operator
=
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
(
AclPoolOp
&&
)
=
delete
;
AclPoolOp
&
operator
=
(
AclPoolOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
PoolParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PoolingLayerInfo
pool_info
;
if
(
args
.
pool_type
==
"max"
)
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
MAX
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
else
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
AVG
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
pooling
,
this
,
pool_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
PoolParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
if
(
args
.
pool_type
!=
"max"
&&
args
.
pool_type
!=
"avg"
)
{
bypass_acl
=
true
;
}
if
(
args
.
filter_rows
!=
args
.
filter_cols
)
{
bypass_acl
=
true
;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
PoolParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
Input
();
Tensor
*
out
=
param
.
Output
();
std
::
string
pooling_type
=
param
.
PoolingType
();
std
::
vector
<
int
>
ksize
=
param
.
Ksize
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
bool
is_global_pooling
=
param
.
isGlobalPooling
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
is_global_pool
=
is_global_pooling
;
args
.
pool_type
=
pooling_type
;
args
.
filter_rows
=
ksize
[
0
];
args
.
filter_cols
=
ksize
[
1
];
args
.
dim
=
ksize
.
size
();
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
args
.
out_depth
=
args
.
in_depth
;
// args.out_rows = out->dims()[2];
// args.out_cols = out->dims()[3];
args
.
out_rows
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_rows
+
2
*
args
.
pad_rows
-
args
.
filter_rows
)
/
args
.
stride_rows
))
+
1
;
args
.
out_cols
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_cols
+
2
*
args
.
pad_cols
-
args
.
filter_cols
)
/
args
.
stride_cols
))
+
1
;
if
(
is_global_pooling
)
{
args
.
filter_rows
=
args
.
in_rows
;
args
.
filter_cols
=
args
.
in_cols
;
args
.
pad_rows
=
0
;
args
.
pad_cols
=
0
;
}
}
acl
::
AclParameters
args
;
};
template
<
>
bool
PoolKernel
<
GPU_MALI
,
float
>::
Init
(
const
PoolParam
&
param
)
const
{
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclPoolOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
PoolKernel
<
GPU_MALI
,
float
>::
Compute
(
const
PoolParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
out_cols
*
args
.
out_rows
;
}
}
template
class
PoolKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/relu_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclReluOp
:
public
acl
::
ACLOperator
{
public:
AclReluOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_RELU
;
}
~
AclReluOp
()
=
default
;
AclReluOp
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
&
operator
=
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
(
AclReluOp
&&
)
=
delete
;
AclReluOp
&
operator
=
(
AclReluOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ReluParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
out_num
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
type
;
type
=
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
::
RELU
;
arm_compute
::
ActivationLayerInfo
act_info
(
type
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
activation
,
this
,
act_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ReluParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ReluParam
&
param
)
{
const
auto
*
input_x
=
param
.
InputX
();
auto
*
out
=
param
.
Out
();
const
T
*
input_data
=
input_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
input_x
->
dims
()[
0
];
args
.
in_depth
=
input_x
->
dims
()[
1
];
args
.
in_rows
=
input_x
->
dims
()[
2
];
args
.
in_cols
=
input_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ReluKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReluParam
&
param
)
const
{
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclReluOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ReluKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReluParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ReluKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/reshape_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ReshapeKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReshapeParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ReshapeKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReshapeParam
&
param
)
const
{
const
auto
*
input_x
=
param
.
InputX
();
const
auto
&
input_x_dims
=
input_x
->
dims
();
auto
*
out
=
param
.
Out
();
framework
::
DDim
out_dims
=
out
->
dims
();
const
auto
*
input_shape
=
param
.
InputShape
();
if
(
input_shape
)
{
auto
*
shape_data
=
input_shape
->
data
<
int
>
();
framework
::
Tensor
cpu_shape_tensor
;
auto
shape
=
std
::
vector
<
int
>
(
shape_data
,
shape_data
+
input_shape
->
numel
());
out_dims
=
ValidateShape
(
shape
,
input_x
->
dims
());
}
bool
inplace
=
param
.
Inplace
();
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
<
float
>
();
framework
::
TensorCopy
(
*
input_x
,
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
input_x
);
out
->
Resize
(
out_dims
);
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/softmax_kernel.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclSoftmaxOp
:
public
acl
::
ACLOperator
{
public:
AclSoftmaxOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_SOFTMAX
;
}
~
AclSoftmaxOp
()
=
default
;
AclSoftmaxOp
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
(
AclSoftmaxOp
&&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
AclSoftmaxOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
SoftmaxParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
softmax
,
this
,
NULL
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
SoftmaxParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
SoftmaxParam
&
param
)
{
const
framework
::
Tensor
*
in_x
=
param
.
InputX
();
framework
::
Tensor
*
out
=
param
.
Out
();
auto
x_dims
=
in_x
->
dims
();
out
->
Resize
(
x_dims
);
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
out_num
=
out
->
dims
()[
0
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
SoftmaxKernel
<
GPU_MALI
,
float
>::
Init
(
const
SoftmaxParam
&
param
)
const
{
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclSoftmaxOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
SoftmaxKernel
<
GPU_MALI
,
float
>::
Compute
(
const
SoftmaxParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
out_num
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
;
output_data
+=
args
.
in_depth
;
}
}
template
class
SoftmaxKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/lrn_op.cpp
浏览文件 @
dd0cf789
...
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
...
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU
(
lrn
,
ops
::
LrnOp
);
REGISTER_OPERATOR_CPU
(
lrn
,
ops
::
LrnOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
lrn
);
REGISTER_OPERATOR_MALI_GPU
(
lrn
,
ops
::
LrnOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/math/depthwiseconv3x3s1p1.cpp
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwiseconv3x3s1p1.h"
#include <arm_neon.h>
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
void
DepthwiseConv3x3s1p1
(
const
Tensor
*
input
,
Tensor
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
)
{
const
float
*
input_data
=
input
->
data
<
float
>
();
const
float
*
filter_data
=
filter
.
data
<
float
>
();
float
*
output_data
=
output
->
data
<
float
>
();
const
float
*
bias_data
=
bias
.
data
<
float
>
();
const
int
h
=
static_cast
<
int
>
(
input
->
dims
()[
2
]);
const
int
w
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
l
=
h
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
const
int
c
=
static_cast
<
int
>
(
input
->
dims
()[
1
]);
const
int
hxw
=
h
*
w
;
float32x4_t
vbias
=
vdupq_n_f32
(
0.0
);
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
const
float
*
filter_data_tmp
=
filter_data
;
for
(
int
j
=
0
;
j
<
c
;
++
j
)
{
if
(
if_bias
)
{
vbias
=
vdupq_n_f32
(
bias_data
[
j
]);
}
int
l_mid
=
l
-
2
;
// l=1->l_mid=-1,l=2->l_mid=0
float
w00
=
filter_data_tmp
[
0
];
float
w01
=
filter_data_tmp
[
1
];
float
w02
=
filter_data_tmp
[
2
];
float
w10
=
filter_data_tmp
[
3
];
float
w11
=
filter_data_tmp
[
4
];
float
w12
=
filter_data_tmp
[
5
];
float
w20
=
filter_data_tmp
[
6
];
float
w21
=
filter_data_tmp
[
7
];
float
w22
=
filter_data_tmp
[
8
];
output_data
[
0
]
=
w11
*
input_data
[
0
]
+
w12
*
input_data
[
1
]
+
w21
*
input_data
[
l
]
+
w22
*
input_data
[
l
+
1
]
+
bias_data
[
j
];
output_data
[
l
-
1
]
=
w10
*
input_data
[
l
-
2
]
+
w11
*
input_data
[
l
-
1
]
+
w20
*
input_data
[
2
*
l
-
2
]
+
w21
*
input_data
[
2
*
l
-
1
]
+
bias_data
[
j
];
output_data
[(
l
-
1
)
*
l
]
=
w01
*
input_data
[(
l
-
2
)
*
l
]
+
w02
*
input_data
[(
l
-
2
)
*
l
+
1
]
+
w11
*
input_data
[(
l
-
1
)
*
l
]
+
w12
*
input_data
[(
l
-
1
)
*
l
+
1
]
+
bias_data
[
j
];
output_data
[
l
*
l
-
1
]
=
w00
*
input_data
[(
l
-
2
)
*
(
l
+
1
)]
+
w01
*
input_data
[(
l
-
2
)
*
(
l
+
1
)
+
1
]
+
w10
*
input_data
[
l
*
l
-
2
]
+
w11
*
input_data
[
l
*
l
-
1
]
+
bias_data
[
j
];
for
(
int
i
=
1
;
i
<
l
-
1
;
++
i
)
{
output_data
[
i
*
l
]
=
w01
*
input_data
[
i
*
l
-
l
]
+
w02
*
input_data
[
i
*
l
-
l
+
1
]
+
w11
*
input_data
[
i
*
l
]
+
w12
*
input_data
[
i
*
l
+
1
]
+
w21
*
input_data
[
i
*
l
+
l
]
+
w22
*
input_data
[
i
*
l
+
l
+
1
]
+
bias_data
[
j
];
output_data
[
i
*
l
+
l
-
1
]
=
w00
*
input_data
[
i
*
l
+
l
-
1
-
l
-
1
]
+
w01
*
input_data
[
i
*
l
+
l
-
1
-
l
]
+
w10
*
input_data
[
i
*
l
+
l
-
1
-
1
]
+
w11
*
input_data
[
i
*
l
+
l
-
1
]
+
w20
*
input_data
[
i
*
l
+
l
-
1
+
l
-
1
]
+
w21
*
input_data
[
i
*
l
+
l
-
1
+
l
]
+
bias_data
[
j
];
}
// top 1 row and bottom 1 row
const
float
*
input_tmp
=
input_data
;
float32x4_t
in0
,
in1
,
in2
,
in3
,
in4
,
in5
,
in6
,
in7
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
out0
;
in0
=
vld1q_f32
(
input_tmp
);
in2
=
vld1q_f32
(
input_tmp
+
l
);
const
float
*
input_tmp_end
=
input_tmp
+
(
l
-
2
)
*
l
;
in4
=
vld1q_f32
(
input_tmp_end
);
in6
=
vld1q_f32
(
input_tmp_end
+
l
);
int
c_mid
=
l_mid
;
auto
output_ptr
=
output_data
+
1
;
for
(;
c_mid
>
3
;
c_mid
-=
4
)
{
in1
=
vld1q_f32
(
input_tmp
+
4
);
in3
=
vld1q_f32
(
input_tmp
+
l
+
4
);
tmp0
=
vextq_f32
(
in0
,
in1
,
1
);
tmp1
=
vextq_f32
(
in0
,
in1
,
2
);
tmp2
=
vextq_f32
(
in2
,
in3
,
1
);
tmp3
=
vextq_f32
(
in2
,
in3
,
2
);
out0
=
vmulq_n_f32
(
in0
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
,
out0
);
in5
=
vld1q_f32
(
input_tmp_end
+
4
);
in7
=
vld1q_f32
(
input_tmp_end
+
l
+
4
);
tmp0
=
vextq_f32
(
in4
,
in5
,
1
);
tmp1
=
vextq_f32
(
in4
,
in5
,
2
);
tmp2
=
vextq_f32
(
in6
,
in7
,
1
);
tmp3
=
vextq_f32
(
in6
,
in7
,
2
);
out0
=
vmulq_n_f32
(
in4
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in6
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
+
(
l
-
1
)
*
l
,
out0
);
// can optimize to each 8 stride.
input_tmp
+=
4
;
input_tmp_end
+=
4
;
output_ptr
+=
4
;
in0
=
in1
;
in2
=
in3
;
in4
=
in5
;
in6
=
in7
;
}
// top right pad
float32x4_t
pad0
=
vdupq_n_f32
(
input_data
[
l
-
1
]);
float32x4_t
pad1
=
vdupq_n_f32
(
input_data
[
2
*
l
-
1
]);
tmp0
=
vextq_f32
(
in0
,
pad0
,
1
);
tmp1
=
vextq_f32
(
in0
,
pad0
,
2
);
tmp2
=
vextq_f32
(
in2
,
pad1
,
1
);
tmp3
=
vextq_f32
(
in2
,
pad1
,
2
);
out0
=
vmulq_n_f32
(
in0
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
2
);
}
}
// bottom right pad
float32x4_t
pad2
=
vdupq_n_f32
(
input_data
[
l
*
l
-
1
-
l
]);
float32x4_t
pad3
=
vdupq_n_f32
(
input_data
[
l
*
l
-
1
]);
tmp0
=
vextq_f32
(
in4
,
pad2
,
1
);
tmp1
=
vextq_f32
(
in4
,
pad2
,
2
);
tmp2
=
vextq_f32
(
in6
,
pad3
,
1
);
tmp3
=
vextq_f32
(
in6
,
pad3
,
2
);
out0
=
vmulq_n_f32
(
in4
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in6
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
(
l
-
1
)
*
l
+
i
,
out0
,
2
);
}
}
// mid
for
(
int
i
=
0
;
i
<
l
-
2
;
++
i
)
{
auto
output_ptr
=
output_data
+
(
i
+
1
)
*
l
+
1
;
input_tmp
=
input_data
+
i
*
l
;
auto
in0_tmp
=
vld1q_f32
(
input_tmp
);
auto
in2_tmp
=
vld1q_f32
(
input_tmp
+
l
);
auto
in4_tmp
=
vld1q_f32
(
input_tmp
+
l
+
l
);
c_mid
=
l_mid
;
for
(;
c_mid
>
3
;
c_mid
-=
4
)
{
auto
in1_tmp
=
vld1q_f32
(
input_tmp
+
4
);
auto
in3_tmp
=
vld1q_f32
(
input_tmp
+
l
+
4
);
auto
in5_tmp
=
vld1q_f32
(
input_tmp
+
l
+
l
+
4
);
tmp0
=
vextq_f32
(
in0_tmp
,
in1_tmp
,
1
);
tmp1
=
vextq_f32
(
in0_tmp
,
in1_tmp
,
2
);
tmp2
=
vextq_f32
(
in2_tmp
,
in3_tmp
,
1
);
tmp3
=
vextq_f32
(
in2_tmp
,
in3_tmp
,
2
);
tmp4
=
vextq_f32
(
in4_tmp
,
in5_tmp
,
1
);
tmp5
=
vextq_f32
(
in4_tmp
,
in5_tmp
,
2
);
out0
=
vmulq_n_f32
(
in0_tmp
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2_tmp
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4_tmp
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
vst1q_f32
(
output_ptr
,
out0
);
output_ptr
+=
4
;
input_tmp
+=
4
;
in0_tmp
=
in1_tmp
;
in2_tmp
=
in3_tmp
;
in4_tmp
=
in5_tmp
;
}
float32x4_t
pad0
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
]);
float32x4_t
pad1
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
+
l
]);
float32x4_t
pad2
=
vdupq_n_f32
(
input_data
[
i
*
l
+
l
-
1
+
l
+
l
]);
tmp0
=
vextq_f32
(
in0_tmp
,
pad0
,
1
);
tmp1
=
vextq_f32
(
in0_tmp
,
pad0
,
2
);
tmp2
=
vextq_f32
(
in2_tmp
,
pad1
,
1
);
tmp3
=
vextq_f32
(
in2_tmp
,
pad1
,
2
);
tmp4
=
vextq_f32
(
in4_tmp
,
pad2
,
1
);
tmp5
=
vextq_f32
(
in4_tmp
,
pad2
,
2
);
out0
=
vmulq_n_f32
(
in0_tmp
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2_tmp
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4_tmp
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vaddq_f32
(
out0
,
vbias
);
for
(
int
i
=
0
;
i
<
c_mid
;
++
i
)
{
if
(
i
==
0
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
0
);
}
if
(
i
==
1
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
1
);
}
if
(
i
==
2
)
{
vst1q_lane_f32
(
output_ptr
+
i
,
out0
,
2
);
}
}
}
output_data
+=
hxw
;
input_data
+=
hxw
;
filter_data_tmp
+=
9
;
}
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/math/depthwiseconv3x3s1p1.h
0 → 100644
浏览文件 @
dd0cf789
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
using
framework
::
Tensor
;
void
DepthwiseConv3x3s1p1
(
const
Tensor
*
input
,
Tensor
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/mul_op.cpp
浏览文件 @
dd0cf789
...
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
...
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU
(
mul
,
ops
::
MulOp
);
REGISTER_OPERATOR_CPU
(
mul
,
ops
::
MulOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
mul
);
REGISTER_OPERATOR_MALI_GPU
(
mul
,
ops
::
MulOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/pool_op.cpp
浏览文件 @
dd0cf789
...
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
...
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU
(
pool2d
,
ops
::
PoolOp
);
REGISTER_OPERATOR_CPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
pool2d
);
REGISTER_OPERATOR_MALI_GPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/relu_op.cpp
浏览文件 @
dd0cf789
...
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
...
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU
(
relu
,
ops
::
ReluOp
);
REGISTER_OPERATOR_CPU
(
relu
,
ops
::
ReluOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
relu
);
REGISTER_OPERATOR_MALI_GPU
(
relu
,
ops
::
ReluOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/reshape_op.cpp
浏览文件 @
dd0cf789
...
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
...
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU
(
reshape
,
ops
::
ReshapeOp
);
REGISTER_OPERATOR_CPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
reshape
);
REGISTER_OPERATOR_MALI_GPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
src/operators/softmax_op.cpp
浏览文件 @
dd0cf789
...
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
...
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU
(
softmax
,
ops
::
SoftmaxOp
);
REGISTER_OPERATOR_CPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
softmax
);
REGISTER_OPERATOR_MALI_GPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
...
tools/push2android.sh
→
tools/
android-debug-script/
push2android.sh
浏览文件 @
dd0cf789
#!/usr/bin/env sh
#!/usr/bin/env sh
push_fn
()
{
push_fn
()
{
MODELS_PATH
=
"../test/models/*"
MODELS_PATH
=
"../
../
test/models/*"
MODELS_SRC
=
"../test/models"
MODELS_SRC
=
"../
../
test/models"
IMAGE_PATH
=
"../test/images/*"
IMAGE_PATH
=
"../
../
test/images/*"
EXE_FILE
=
"../test/build/*"
EXE_FILE
=
"../
../
test/build/*"
EXE_DIR
=
"data/local/tmp/bin"
EXE_DIR
=
"data/local/tmp/bin"
adb shell
mkdir
${
EXE_DIR
}
adb shell
mkdir
${
EXE_DIR
}
MODELS_DIR
=
"data/local/tmp/models"
MODELS_DIR
=
"data/local/tmp/models"
...
@@ -14,9 +14,14 @@ do
...
@@ -14,9 +14,14 @@ do
adb shell
mkdir
${
MODELS_DIR
}
"/"
${
file
}
adb shell
mkdir
${
MODELS_DIR
}
"/"
${
file
}
done
done
if
[[
-d
"../../src/operators/kernel/mali/ACL_Android/build"
]]
;
then
ACL_BUILD_PATH
=
"../../src/operators/kernel/mali/ACL_Android/build/*"
adb push
${
ACL_BUILD_PATH
}
${
EXE_DIR
}
fi
IMAGES_DIR
=
"data/local/tmp/images"
IMAGES_DIR
=
"data/local/tmp/images"
adb shell
mkdir
${
IMAGES_DIR
}
adb shell
mkdir
${
IMAGES_DIR
}
LIB_PATH
=
"../build/release/arm-v7a/build/*"
LIB_PATH
=
"../
../
build/release/arm-v7a/build/*"
adb push
${
EXE_FILE
}
${
EXE_DIR
}
adb push
${
EXE_FILE
}
${
EXE_DIR
}
adb push
${
LIB_PATH
}
${
EXE_DIR
}
adb push
${
LIB_PATH
}
${
EXE_DIR
}
if
[[
$1
!=
"npm"
]]
;
then
if
[[
$1
!=
"npm"
]]
;
then
...
...
tools/
scripts
/run_on_android.sh
→
tools/
android-debug-script
/run_on_android.sh
浏览文件 @
dd0cf789
文件已移动
tools/build.sh
浏览文件 @
dd0cf789
...
@@ -56,7 +56,7 @@ build_for_android() {
...
@@ -56,7 +56,7 @@ build_for_android() {
MODE
=
"Release"
MODE
=
"Release"
ANDROID_PLATFORM_VERSION
=
"android-
15
"
ANDROID_PLATFORM_VERSION
=
"android-
22
"
TOOLCHAIN_FILE
=
"./tools/android-cmake/android.toolchain.cmake"
TOOLCHAIN_FILE
=
"./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE
=
"arm"
ANDROID_ARM_MODE
=
"arm"
if
[
$#
-eq
1
]
;
then
if
[
$#
-eq
1
]
;
then
...
...
tools/run.sh
已删除
100644 → 0
浏览文件 @
97102280
#!/usr/bin/env sh
# auto build and run
BUILDNET
=
"mobilenetssd"
TESTUNIT
=
"test-mobilenetssd"
push_fn
()
{
sh build.sh android
${
BUILDNET
}
MODELS_PATH
=
"../test/models/*"
MODELS_SRC
=
"../test/models"
IMAGE_PATH
=
"../test/images/*"
EXE_FILE
=
"../test/build/*"
EXE_DIR
=
"data/local/tmp/bin"
adb shell
mkdir
${
EXE_DIR
}
MODELS_DIR
=
"data/local/tmp/models"
adb shell
mkdir
${
MODELS_DIR
}
for
file
in
`
ls
${
MODELS_SRC
}
`
do
adb shell
mkdir
${
MODELS_DIR
}
"/"
${
file
}
done
IMAGES_DIR
=
"data/local/tmp/images"
adb shell
mkdir
${
IMAGES_DIR
}
LIB_PATH
=
"../build/release/arm-v7a/build/*"
adb push
${
EXE_FILE
}
${
EXE_DIR
}
adb push
${
LIB_PATH
}
${
EXE_DIR
}
if
[[
$1
!=
"npm"
]]
;
then
adb push
${
IMAGE_PATH
}
${
IMAGES_DIR
}
adb push
${
MODELS_PATH
}
${
MODELS_DIR
}
fi
adb shell
"cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./
${
TESTUNIT
}
"
}
if
[[
$1
==
"npm"
]]
;
then
push_fn
$1
else
push_fn
fi
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录