Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
bec8987d
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
bec8987d
编写于
6月 28, 2018
作者:
S
sharper
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
ready for PR
上级
0342c83c
变更
34
展开全部
隐藏空白更改
内联
并排
Showing
34 changed file
with
3280 addition
and
13 deletion
+3280
-13
.gitmodules
.gitmodules
+3
-0
CMakeLists.txt
CMakeLists.txt
+12
-1
src/framework/operator.h
src/framework/operator.h
+12
-0
src/operators/batchnorm_op.cpp
src/operators/batchnorm_op.cpp
+2
-0
src/operators/concat_op.cpp
src/operators/concat_op.cpp
+2
-0
src/operators/elementwise_add_op.cpp
src/operators/elementwise_add_op.cpp
+2
-0
src/operators/feed_op.h
src/operators/feed_op.h
+2
-0
src/operators/fetch_op.h
src/operators/fetch_op.h
+2
-0
src/operators/fusion_conv_add.cpp
src/operators/fusion_conv_add.cpp
+2
-0
src/operators/fusion_fc_op.cpp
src/operators/fusion_fc_op.cpp
+2
-0
src/operators/kernel/mali/ACL_Android
src/operators/kernel/mali/ACL_Android
+1
-0
src/operators/kernel/mali/acl_operator.cc
src/operators/kernel/mali/acl_operator.cc
+212
-0
src/operators/kernel/mali/acl_operator.h
src/operators/kernel/mali/acl_operator.h
+1136
-0
src/operators/kernel/mali/acl_tensor.cc
src/operators/kernel/mali/acl_tensor.cc
+152
-0
src/operators/kernel/mali/acl_tensor.h
src/operators/kernel/mali/acl_tensor.h
+120
-0
src/operators/kernel/mali/batchnorm_kernel.cpp
src/operators/kernel/mali/batchnorm_kernel.cpp
+135
-3
src/operators/kernel/mali/concat_kernel.cpp
src/operators/kernel/mali/concat_kernel.cpp
+140
-0
src/operators/kernel/mali/conv_add_kernel.cpp
src/operators/kernel/mali/conv_add_kernel.cpp
+235
-0
src/operators/kernel/mali/conv_kernel.cpp
src/operators/kernel/mali/conv_kernel.cpp
+199
-5
src/operators/kernel/mali/elementwise_add_kernel.cpp
src/operators/kernel/mali/elementwise_add_kernel.cpp
+52
-0
src/operators/kernel/mali/fushion_fc_kernel.cpp
src/operators/kernel/mali/fushion_fc_kernel.cpp
+77
-0
src/operators/kernel/mali/lrn_kernel.cpp
src/operators/kernel/mali/lrn_kernel.cpp
+148
-0
src/operators/kernel/mali/mul_kernel.cpp
src/operators/kernel/mali/mul_kernel.cpp
+59
-0
src/operators/kernel/mali/pool_kernel.cpp
src/operators/kernel/mali/pool_kernel.cpp
+220
-0
src/operators/kernel/mali/relu_kernel.cpp
src/operators/kernel/mali/relu_kernel.cpp
+136
-0
src/operators/kernel/mali/reshape_kernel.cpp
src/operators/kernel/mali/reshape_kernel.cpp
+60
-0
src/operators/kernel/mali/softmax_kernel.cpp
src/operators/kernel/mali/softmax_kernel.cpp
+137
-0
src/operators/lrn_op.cpp
src/operators/lrn_op.cpp
+2
-0
src/operators/mul_op.cpp
src/operators/mul_op.cpp
+2
-0
src/operators/pool_op.cpp
src/operators/pool_op.cpp
+2
-0
src/operators/relu_op.cpp
src/operators/relu_op.cpp
+2
-0
src/operators/reshape_op.cpp
src/operators/reshape_op.cpp
+2
-0
src/operators/softmax_op.cpp
src/operators/softmax_op.cpp
+2
-0
tools/build.sh
tools/build.sh
+8
-4
未找到文件。
.gitmodules
0 → 100644
浏览文件 @
bec8987d
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
CMakeLists.txt
浏览文件 @
bec8987d
...
...
@@ -6,7 +6,7 @@ option(USE_OPENMP "openmp support" OFF)
option
(
USE_EXCEPTION
"use std exception"
ON
)
option
(
LOG_PROFILE
"log profile"
ON
)
# select the platform to build
option
(
CPU
"cpu"
O
N
)
option
(
CPU
"cpu"
O
FF
)
option
(
MALI_GPU
"mali gpu"
ON
)
option
(
FPGA
"fpga"
OFF
)
...
...
@@ -16,6 +16,17 @@ endif()
if
(
MALI_GPU
)
add_definitions
(
-DPADDLE_MOBILE_MALI_GPU
)
add_definitions
(
-DUSE_ACL=1
)
add_definitions
(
-DUSE_OPENCL
)
set
(
ACL_ROOT
${
CMAKE_CURRENT_SOURCE_DIR
}
/src/operators/kernel/mali/ACL_Android
)
include_directories
(
${
ACL_ROOT
}
${
ACL_ROOT
}
/include
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_core"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-larm_compute_graph"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-L
${
ACL_ROOT
}
/build/opencl-1.2-stubs"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-lOpenCL"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-DUSE_ACL=1"
)
endif
()
if
(
FPGA
)
...
...
src/framework/operator.h
浏览文件 @
bec8987d
...
...
@@ -138,9 +138,21 @@ class OpKernelBase {
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* */
#ifdef PADDLE_MOBILE_MALI_GPU
OpKernelBase
()
{
acl_op_
=
nullptr
;
}
void
*
GetAclOp
()
const
{
return
acl_op_
;
}
void
SetAclOp
(
void
*
op
,
void
*
ob
)
const
{
reinterpret_cast
<
OpKernelBase
<
Dtype
,
P
>
*>
(
ob
)
->
acl_op_
=
op
;
}
#endif
virtual
void
Compute
(
const
P
&
para
)
const
=
0
;
virtual
bool
Init
(
const
P
&
para
)
const
{
return
true
;
};
virtual
~
OpKernelBase
()
=
default
;
private:
#ifdef PADDLE_MOBILE_MALI_GPU
void
*
acl_op_
;
#endif
};
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
...
...
src/operators/batchnorm_op.cpp
浏览文件 @
bec8987d
...
...
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
batch_norm
);
REGISTER_OPERATOR_MALI_GPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/concat_op.cpp
浏览文件 @
bec8987d
...
...
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU
(
concat
,
ops
::
ConcatOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
concat
);
REGISTER_OPERATOR_MALI_GPU
(
concat
,
ops
::
ConcatOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/elementwise_add_op.cpp
浏览文件 @
bec8987d
...
...
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
elementwise_add
);
REGISTER_OPERATOR_MALI_GPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/feed_op.h
浏览文件 @
bec8987d
...
...
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU
(
feed
,
ops
::
FeedOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
feed
);
REGISTER_OPERATOR_MALI_GPU
(
feed
,
ops
::
FeedOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fetch_op.h
浏览文件 @
bec8987d
...
...
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU
(
fetch
,
ops
::
FetchOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fetch
);
REGISTER_OPERATOR_MALI_GPU
(
fetch
,
ops
::
FetchOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_conv_add.cpp
浏览文件 @
bec8987d
...
...
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
conv_add
);
REGISTER_OPERATOR_MALI_GPU
(
conv_add
,
ops
::
FusionConvAddOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/fusion_fc_op.cpp
浏览文件 @
bec8987d
...
...
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
fc
);
REGISTER_OPERATOR_MALI_GPU
(
fc
,
ops
::
FusionFcOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
ACL_Android
@
591027fc
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
src/operators/kernel/mali/acl_operator.cc
0 → 100644
浏览文件 @
bec8987d
/*!
* Copyright (c) 2016 by Contributors
* \file acl_operator.cc
* \brief
* \author Joey
*/
#if USE_ACL == 1
#include "acl_operator.h"
unsigned
int
bypass_acl_class_layer
=
(
0
|
FLAGS_ENABLE_ACL_CONCAT
|
/*0xffffffff |*/
/*FLAGS_ENABLE_ACL_FC |*/
/*FLAGS_ENABLE_ACL_LRN
|*/
0
);
int
enable_schedule
=
0
;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned
int
acl_log_flags
=
(
0
|
MASK_LOG_APP_TIME
|
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */
/*MASK_LOG_CONFIG | */
/*MASK_LOG_COPY | */
MASK_LOG_ABSVAL
|
MASK_LOG_BNLL
|
MASK_LOG_CONV
|
MASK_LOG_FC
|
MASK_LOG_LRN
|
MASK_LOG_POOLING
|
MASK_LOG_RELU
|
MASK_LOG_SIGMOID
|
MASK_LOG_SOFTMAX
|
MASK_LOG_TANH
|
MASK_LOG_LC
|
MASK_LOG_BN
|
MASK_LOG_CONCAT
|
0
);
#include <stdio.h>
/* printf */
#include <stdlib.h>
/* getenv */
#endif // USE_PROFILING
static
bool
force_enable_gpu
=
false
;
bool
AclEnableSchedule
(
int
enable
)
{
enable_schedule
=
enable
;
if
(
enable
)
{
force_enable_gpu
=
true
;
}
return
true
;
}
int
isScheduleEnable
()
{
return
enable_schedule
;
}
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
bool
ACLOperator
::
init_gpu_env
=
true
;
#ifdef USE_OPENCL
bool
ACLOperator
::
support_opencl_
=
false
;
bool
opencl_is_available
()
{
return
arm_compute
::
opencl_is_available
();
}
#elif defined(USE_OPENGLES)
bool
ACLOperator
::
support_opengles_
=
false
;
#endif
ACLOperator
::
ACLOperator
(
bool
is_gpu
)
:
operator_state_
(
operator_not_init
),
force_bypass_acl_path_
(
false
),
target_hint_
(
TargetHint
::
DONT_CARE
),
convolution_method_hint_
(
ConvolutionMethodHint
::
GEMM
),
_group
(
1
),
name_
(
""
),
input_idx_
(
0
),
output_idx_
(
0
),
is_gpu_
(
is_gpu
)
{
const
char
*
pBypassACL
;
if
(
init_gpu_env
)
{
#ifdef USE_OPENCL
try
{
if
(
opencl_is_available
())
{
arm_compute
::
CLScheduler
::
get
().
default_init
();
support_opencl_
=
true
;
}
}
catch
(
std
::
exception
&
e
)
{
support_opencl_
=
false
;
}
#elif defined(USE_OPENGLES)
try
{
arm_compute
::
GCScheduler
::
get
().
default_init
();
support_opengles_
=
true
;
}
catch
(
std
::
exception
&
e
)
{
support_opengles_
=
false
;
}
#endif
init_gpu_env
=
false
;
}
if
(
force_enable_gpu
)
is_gpu_
=
true
;
pBypassACL
=
getenv
(
"BYPASSACL"
);
if
(
pBypassACL
)
{
unsigned
int
bacl
;
sscanf
(
pBypassACL
,
"%i"
,
&
bacl
);
if
(
bacl
!=
bypass_acl_class_layer
)
{
bypass_acl_class_layer
=
bacl
;
printf
(
"BYPASSACL<%s>
\n
"
,
pBypassACL
);
printf
(
"BYPASSACL: %x
\n
"
,
bypass_acl_class_layer
);
}
}
#ifdef USE_PROFILING
const
char
*
pLogACL
;
pLogACL
=
getenv
(
"LOGACL"
);
if
(
pLogACL
)
{
unsigned
int
alf
;
sscanf
(
pLogACL
,
"%i"
,
&
alf
);
if
(
alf
!=
acl_log_flags
)
{
acl_log_flags
=
alf
;
printf
(
"LOGACL<%s>
\n
"
,
pLogACL
);
printf
(
"LOGACL: %x
\n
"
,
acl_log_flags
);
}
}
#endif // USE_PROFILING
const
char
*
pEnableSchedule
;
pEnableSchedule
=
getenv
(
"ENABLESCHEDULE"
);
if
(
pEnableSchedule
)
{
int
bshedule
;
sscanf
(
pEnableSchedule
,
"%i"
,
&
bshedule
);
if
(
bshedule
!=
enable_schedule
)
{
enable_schedule
=
bshedule
;
printf
(
"ENABLESCHEDULE<%s>
\n
"
,
pEnableSchedule
);
printf
(
"ENABLESCHEDULE: %x
\n
"
,
enable_schedule
);
}
if
(
enable_schedule
)
{
AclEnableSchedule
(
1
);
}
}
}
ACLOperator
::~
ACLOperator
()
{}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLTensor
>&
tensor
,
arm_compute
::
TensorShape
&
shape
,
void
*
mem
,
bool
commit
)
{
auto
acl_tensor
=
new
ACLTensor
(
arm_compute
::
TensorInfo
(
shape
,
arm_compute
::
Format
::
F32
));
acl_tensor
->
set_target
(
getTargetHint
());
acl_tensor
->
bindmem
(
mem
);
if
(
commit
)
acl_tensor
->
commit
();
tensor
=
(
std
::
unique_ptr
<
ACLTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
bool
ACLOperator
::
new_tensor
(
std
::
unique_ptr
<
ACLSubTensor
>&
tensor
,
std
::
unique_ptr
<
ACLTensor
>&
parent
,
arm_compute
::
TensorShape
&
shape
,
arm_compute
::
Coordinates
&
coord
)
{
auto
acl_tensor
=
new
ACLSubTensor
(
parent
,
shape
,
coord
);
acl_tensor
->
set_target
(
getTargetHint
());
tensor
=
(
std
::
unique_ptr
<
ACLSubTensor
>
)
std
::
move
(
acl_tensor
);
return
true
;
}
void
ACLTensor
::
commit
(
TensorType
type
)
{
settensortype
(
type
);
if
(
mem_
)
{
if
(
!
allocate_
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_ALLOCATE_INFO
);
#endif // USE_PROFILING
allocate
();
allocate_
=
true
;
}
if
(
type_
!=
tensor_output
)
{
tensor_copy
(
mem_
);
}
mem_
=
nullptr
;
}
}
int
BaseACLTensor
::
tensor_copy
(
arm_compute
::
ITensor
*
tensor
,
void
*
mem
,
bool
toTensor
)
{
#ifdef USE_PROFILING
logtime_util
log_time
(
ACL_COPY_INFO
);
#endif // USE_PROFILING
arm_compute
::
Window
window
;
// Iterate through the rows (not each element)
window
.
use_tensor_dimensions
(
tensor
->
info
()
->
tensor_shape
(),
/* first_dimension =*/
arm_compute
::
Window
::
DimY
);
int
width
=
tensor
->
info
()
->
tensor_shape
()[
0
];
int
height
=
tensor
->
info
()
->
tensor_shape
()[
1
];
int
deepth
=
tensor
->
info
()
->
tensor_shape
()[
2
];
map
();
// Create an iterator:
arm_compute
::
Iterator
it
(
tensor
,
window
);
// Except it works for an arbitrary number of dimensions
if
(
toTensor
)
{
// mem->tensor
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(
it
.
ptr
(),
((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
+
id
.
x
())
*
tensor
->
info
()
->
element_size
()),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
else
{
// tensor-->mem
arm_compute
::
execute_window_loop
(
window
,
[
&
](
const
arm_compute
::
Coordinates
&
id
)
{
memcpy
(((
char
*
)
mem
)
+
((
id
[
3
]
*
(
width
*
height
*
deepth
)
+
id
.
z
()
*
(
width
*
height
)
+
id
.
y
()
*
width
)
*
tensor
->
info
()
->
element_size
()),
it
.
ptr
(),
width
*
tensor
->
info
()
->
element_size
());
},
it
);
}
unmap
();
return
0
;
}
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/acl_operator.h
0 → 100644
浏览文件 @
bec8987d
此差异已折叠。
点击以展开。
src/operators/kernel/mali/acl_tensor.cc
0 → 100644
浏览文件 @
bec8987d
/*!
* Copyright (c) 2016 by Contributors
* \file acl_tensor.cc
* \brief
* \author Joey
*/
#include "acl_tensor.h"
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
#ifdef USE_ACL
template
<
typename
TensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_tensor
(
arm_compute
::
TensorInfo
&
info
)
{
auto
tensor
=
cpp14
::
make_unique
<
TensorType
>
();
tensor
->
allocator
()
->
init
(
info
);
return
std
::
move
(
tensor
);
}
template
<
typename
TensorType
>
void
tensor_allocate
(
arm_compute
::
ITensor
&
tensor
)
{
auto
itensor
=
dynamic_cast
<
TensorType
*>
(
&
tensor
);
itensor
->
allocator
()
->
allocate
();
}
Tensor
::
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_info
(
info
),
_tensor
(
nullptr
)
{}
Tensor
::
Tensor
(
Tensor
&&
src
)
noexcept
:
_target
(
src
.
_target
),
_info
(
std
::
move
(
src
.
_info
)),
_tensor
(
std
::
move
(
src
.
_tensor
))
{}
arm_compute
::
ITensor
*
Tensor
::
set_target
(
TargetHint
target
)
{
switch
(
target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_tensor
=
initialise_tensor
<
arm_compute
::
CLTensor
>
(
_info
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
_tensor
=
initialise_tensor
<
arm_compute
::
GCTensor
>
(
_info
);
break
;
#endif
case
TargetHint
::
NEON
:
_tensor
=
initialise_tensor
<
arm_compute
::
Tensor
>
(
_info
);
break
;
default:
break
;
}
_target
=
target
;
return
_tensor
.
get
();
}
void
Tensor
::
allocate
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
tensor_allocate
<
arm_compute
::
CLTensor
>
(
*
_tensor
);
break
;
#elif defined(USE_OPENGLES)
case
TargetHint
::
OPENGLES
:
tensor_allocate
<
arm_compute
::
GCTensor
>
(
*
_tensor
);
break
;
#endif
case
TargetHint
::
NEON
:
tensor_allocate
<
arm_compute
::
Tensor
>
(
*
_tensor
);
break
;
default:
break
;
}
}
void
Tensor
::
map
(
bool
blocking
)
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
map
(
blocking
);
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
map
(
blocking
);
#endif
}
void
Tensor
::
unmap
()
{
#ifdef USE_OPENCL
if
(
_target
==
TargetHint
::
OPENCL
)
dynamic_cast
<
arm_compute
::
CLTensor
*>
(
tensor
())
->
unmap
();
#elif defined(USE_OPENGLES)
if
(
_target
==
TargetHint
::
OPENGLES
)
dynamic_cast
<
arm_compute
::
GCTensor
*>
(
tensor
())
->
unmap
();
#endif
}
template
<
typename
SubTensorType
,
typename
ParentTensorType
>
std
::
unique_ptr
<
arm_compute
::
ITensor
>
initialise_subtensor
(
arm_compute
::
ITensor
*
parent
,
arm_compute
::
TensorShape
shape
,
arm_compute
::
Coordinates
coords
)
{
auto
ptensor
=
dynamic_cast
<
ParentTensorType
*>
(
parent
);
auto
subtensor
=
cpp14
::
make_unique
<
SubTensorType
>
(
ptensor
,
shape
,
coords
);
return
std
::
move
(
subtensor
);
}
SubTensor
::
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
:
_target
(
TargetHint
::
DONT_CARE
),
_tensor_shape
(
tensor_shape
),
_coords
(
coords
),
_parent
(
nullptr
),
_subtensor
(
nullptr
)
{
_parent
=
parent
->
tensor
();
_target
=
parent
->
target
();
instantiate_subtensor
();
}
arm_compute
::
ITensor
*
SubTensor
::
set_target
(
TargetHint
target
)
{
return
(
target
==
_target
)
?
_subtensor
.
get
()
:
nullptr
;
}
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
{
return
_subtensor
.
get
();
}
const
arm_compute
::
ITensor
*
SubTensor
::
tensor
()
const
{
return
_subtensor
.
get
();
}
TargetHint
SubTensor
::
target
()
const
{
return
_target
;
}
void
SubTensor
::
allocate
()
{
// NOP for sub-tensors
}
void
SubTensor
::
instantiate_subtensor
()
{
switch
(
_target
)
{
#ifdef USE_OPENCL
case
TargetHint
::
OPENCL
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
CLSubTensor
,
arm_compute
::
ICLTensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
#endif
default:
case
TargetHint
::
NEON
:
_subtensor
=
initialise_subtensor
<
arm_compute
::
SubTensor
,
arm_compute
::
ITensor
>
(
_parent
,
_tensor_shape
,
_coords
);
break
;
}
}
#endif
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
src/operators/kernel/mali/acl_tensor.h
0 → 100644
浏览文件 @
bec8987d
/*!
* Copyright (c) 2016 by Contributors
* \file acl_tensor.h
* \brief
* \author Joey
*/
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace
paddle_mobile
{
namespace
operators
{
namespace
acl
{
enum
class
TargetHint
{
DONT_CARE
,
OPENCL
,
OPENGLES
,
NEON
,
};
enum
class
ConvolutionMethodHint
{
GEMM
,
DIRECT
,
};
namespace
cpp14
{
template
<
class
T
>
struct
_Unique_if
{
typedef
std
::
unique_ptr
<
T
>
_Single_object
;
};
template
<
class
T
>
struct
_Unique_if
<
T
[]
>
{
typedef
std
::
unique_ptr
<
T
[]
>
_Unknown_bound
;
};
template
<
class
T
,
size_t
N
>
struct
_Unique_if
<
T
[
N
]
>
{
typedef
void
_Known_bound
;
};
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Single_object
make_unique
(
Args
&&
...
args
)
{
return
std
::
unique_ptr
<
T
>
(
new
T
(
std
::
forward
<
Args
>
(
args
)...));
}
template
<
class
T
>
typename
_Unique_if
<
T
>::
_Unknown_bound
make_unique
(
size_t
n
)
{
typedef
typename
std
::
remove_extent
<
T
>::
type
U
;
return
std
::
unique_ptr
<
T
>
(
new
U
[
n
]());
}
template
<
class
T
,
class
...
Args
>
typename
_Unique_if
<
T
>::
_Known_bound
make_unique
(
Args
&&
...);
}
// namespace cpp14
class
Tensor
{
public:
explicit
Tensor
(
arm_compute
::
TensorInfo
&
info
)
noexcept
;
virtual
~
Tensor
()
{}
Tensor
(
Tensor
&&
src
)
noexcept
;
void
set_info
(
arm_compute
::
TensorInfo
&&
info
)
{
_info
=
info
;
}
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
const
arm_compute
::
TensorInfo
&
info
()
const
{
return
_info
;
}
arm_compute
::
ITensor
*
tensor
()
{
return
_tensor
.
get
();
}
void
allocate
();
void
init
()
{}
TargetHint
target
()
const
{
return
_target
;
}
virtual
void
map
(
bool
blocking
=
true
);
virtual
void
unmap
();
private:
TargetHint
_target
;
arm_compute
::
TensorInfo
_info
;
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_tensor
;
};
class
SubTensor
{
public:
SubTensor
(
Tensor
*
parent
,
arm_compute
::
TensorShape
&
tensor_shape
,
arm_compute
::
Coordinates
&
coords
)
noexcept
;
~
SubTensor
()
{}
arm_compute
::
ITensor
*
tensor
();
const
arm_compute
::
ITensor
*
tensor
()
const
;
TargetHint
target
()
const
;
void
allocate
();
arm_compute
::
ITensor
*
set_target
(
TargetHint
target
);
private:
/** Instantiates a sub-tensor */
void
instantiate_subtensor
();
private:
/**< Target that this tensor is pinned on */
TargetHint
_target
;
/**< SubTensor shape */
arm_compute
::
TensorShape
_tensor_shape
;
/**< SubTensor Coordinates */
arm_compute
::
Coordinates
_coords
;
/**< Parent tensor */
arm_compute
::
ITensor
*
_parent
;
/**< SubTensor */
std
::
unique_ptr
<
arm_compute
::
ITensor
>
_subtensor
;
};
}
// namespace acl
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
src/operators/kernel/mali/batchnorm_kernel.cpp
浏览文件 @
bec8987d
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -17,20 +17,152 @@ limitations under the License. */
#pragma once
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclBatchNormOp
:
public
acl
::
ACLOperator
{
public:
AclBatchNormOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_BN
;
}
~
AclBatchNormOp
()
=
default
;
AclBatchNormOp
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
const
AclBatchNormOp
&
)
=
delete
;
AclBatchNormOp
(
AclBatchNormOp
&&
)
=
delete
;
AclBatchNormOp
&
operator
=
(
AclBatchNormOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
BatchNormParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
TensorShape
mean_shape
(
args
.
in_depth
);
arm_compute
::
TensorShape
var_shape
=
mean_shape
;
arm_compute
::
TensorShape
beta_shape
=
mean_shape
;
arm_compute
::
TensorShape
gamma_shape
=
mean_shape
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
new_tensor
(
mean
(),
mean_shape
,
args
.
mean_data
);
new_tensor
(
var
(),
var_shape
,
args
.
var_data
);
new_tensor
(
beta
(),
beta_shape
,
args
.
biases_data
);
new_tensor
(
gamma
(),
gamma_shape
,
args
.
weight_data
);
acl_configure
(
bn
,
this
,
args
.
epsilon
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
BatchNormParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
BatchNormParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
OutputY
();
const
Tensor
*
scale
=
param
.
InputScale
();
const
Tensor
*
bias
=
param
.
InputBias
();
const
Tensor
*
saved_mean
=
param
.
InputMean
();
const
Tensor
*
saved_variance
=
param
.
InputVariance
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
scale
->
data
<
T
>
();
const
T
*
bias_data
=
bias
->
data
<
T
>
();
const
T
*
mean_data
=
saved_mean
->
data
<
T
>
();
const
T
*
var_data
=
saved_variance
->
data
<
T
>
();
float
epsilon
=
param
.
Epsilon
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args
.
mean_data
=
(
void
*
)
mean_data
;
args
.
var_data
=
(
void
*
)
var_data
;
args
.
epsilon
=
epsilon
;
args
.
dim
=
in_x
->
dims
().
size
();
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
args
.
out_depth
=
out
->
dims
()[
1
];
args
.
out_rows
=
out
->
dims
()[
2
];
args
.
out_cols
=
out
->
dims
()[
3
];
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
(
void
*
)
bias_data
;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
para
)
const
{
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
param
)
const
{
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclBatchNormOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
BatchNormKernel
<
GPU_MALI
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{}
const
BatchNormParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclBatchNormOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclBatchNormOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
BatchNormKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/concat_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConcatOp
:
public
acl
::
ACLOperator
{
public:
AclConcatOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONCAT
;
}
~
AclConcatOp
()
=
default
;
AclConcatOp
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
&
operator
=
(
const
AclConcatOp
&
)
=
delete
;
AclConcatOp
(
AclConcatOp
&&
)
=
delete
;
AclConcatOp
&
operator
=
(
AclConcatOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConcatParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
const
std
::
vector
<
framework
::
LoDTensor
*>*
input_data
=
&
args
.
in_tensor
;
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
output_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
T
type
;
for
(
int
i
=
0
;
i
<
input_data
->
size
();
i
++
)
{
const
T
*
idata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
const
T
*
pdata
=
(
*
input_data
)[
i
]
->
data
<
T
>
();
int
in_batch
=
(
*
input_data
)[
i
]
->
dims
()[
0
];
int
in_channels
=
(
*
input_data
)[
i
]
->
dims
()[
1
];
int
in_width
=
(
*
input_data
)[
i
]
->
dims
()[
2
];
int
in_height
=
(
*
input_data
)[
i
]
->
dims
()[
3
];
arm_compute
::
TensorShape
in_shape
(
in_width
,
in_height
,
in_channels
);
new_tensor
(
cinput
(
i
),
in_shape
,
acl
::
InputdataPtr
(
this
,
args
.
in_tensor
,
type
,
i
));
}
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
concat
,
this
,
input_data
->
size
());
}
void
RunAcl
(
const
std
::
vector
<
framework
::
LoDTensor
*>&
input
,
void
*
output
)
{
T
type
;
acl
::
acl_run
(
this
,
input
,
output
,
type
);
}
bool
Bypass_acl
(
const
ConcatParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
!
args
.
is_channel_concat
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ConcatParam
&
param
)
{
auto
inputs
=
param
.
Inputs
();
auto
*
output
=
param
.
Out
();
int64_t
axis
=
param
.
Axis
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
args
.
is_channel_concat
=
(
axis
==
1
);
args
.
in_tensor
=
inputs
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConcatKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConcatParam
&
param
)
const
{
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConcatOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConcatKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConcatParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConcatOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConcatOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
std
::
vector
<
framework
::
LoDTensor
*>
temp_data
=
args
.
in_tensor
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
(
temp_data
,
(
void
*
)
output_data
);
}
template
class
ConcatKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_add_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvAddOp
:
public
acl
::
ACLOperator
{
public:
AclConvAddOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvAddOp
()
=
default
;
AclConvAddOp
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
&
operator
=
(
const
AclConvAddOp
&
)
=
delete
;
AclConvAddOp
(
AclConvAddOp
&&
)
=
delete
;
AclConvAddOp
&
operator
=
(
AclConvAddOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
FusionConvAddParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
arm_compute
::
TensorShape
biases_shape
(
args
.
out_depth
);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
if
(
args
.
biases_data
)
{
new_tensor
(
biases
(),
biases_shape
,
args
.
biases_data
);
}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
FusionConvAddParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
FusionConvAddParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
Tensor
*
bias
;
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
try
{
bias
=
param
.
Bias
();
}
catch
(
const
std
::
exception
&
e
)
{
}
if
(
bias
)
{
const
T
*
biases_data
=
bias
->
data
<
T
>
();
args
.
biases_data
=
(
void
*
)
biases_data
;
}
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConvAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionConvAddParam
&
param
)
const
{
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvAddOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConvAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionConvAddParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConvAddOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvAddOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ConvAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/conv_kernel.cpp
浏览文件 @
bec8987d
...
...
@@ -15,20 +15,213 @@ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclConvOp
:
public
acl
::
ACLOperator
{
public:
AclConvOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_CONV
;
}
~
AclConvOp
()
=
default
;
AclConvOp
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
&
operator
=
(
const
AclConvOp
&
)
=
delete
;
AclConvOp
(
AclConvOp
&&
)
=
delete
;
AclConvOp
&
operator
=
(
AclConvOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ConvParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
,
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
,
args
.
out_num
);
arm_compute
::
TensorShape
weights_shape
(
args
.
filter_cols
,
args
.
filter_rows
,
args
.
in_depth
/
args
.
num_group
,
args
.
out_depth
);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PadStrideInfo
conv_info
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
FLOOR
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
check_direct_conv
();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor
(
weights
(),
weights_shape
,
args
.
weight_data
);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group
()
=
args
.
num_group
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
conv
,
this
,
conv_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ConvParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
||
args
.
num_group
>=
5
)
{
bypass_acl
=
true
;
}
if
(
args
.
dim
>
2
)
{
bypass_acl
=
true
;
}
if
(
args
.
dilated
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
check_direct_conv
()
{
bool
use_direct_conv
=
false
;
const
char
*
pDirectConv
;
pDirectConv
=
getenv
(
"DIRECTCONV"
);
if
(
pDirectConv
)
{
unsigned
int
bdirectconv
;
sscanf
(
pDirectConv
,
"%i"
,
&
bdirectconv
);
if
(
bdirectconv
!=
use_direct_conv
)
{
use_direct_conv
=
bdirectconv
;
printf
(
"DIRECTCONV<%s>
\n
"
,
pDirectConv
);
printf
(
"DIRECTCONV: %x
\n
"
,
use_direct_conv
);
}
}
int
pad_data
[
2
],
kernel
[
2
];
pad_data
[
1
]
=
args
.
pad_rows
;
pad_data
[
0
]
=
args
.
pad_cols
;
kernel
[
1
]
=
args
.
filter_rows
;
kernel
[
0
]
=
args
.
filter_cols
;
if
(
use_direct_conv
&&
((
kernel
[
0
]
==
1
&&
kernel
[
1
]
==
1
&&
pad_data
[
0
]
==
0
&&
pad_data
[
1
]
==
0
)
||
(
kernel
[
0
]
==
3
&&
kernel
[
1
]
==
3
&&
pad_data
[
0
]
<=
1
&&
pad_data
[
1
]
<=
1
)))
{
setConvMethod
();
// NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void
AclParametersByContext
(
const
ConvParam
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
();
const
T
*
weight_data
=
filter
.
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
weight_data
=
(
void
*
)
weight_data
;
args
.
biases_data
=
nullptr
;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args
.
num_group
=
groups
;
args
.
dilation_rows
=
dilations
[
0
];
args
.
dilation_cols
=
dilations
[
1
];
if
(
dilations
[
0
]
!=
1
||
dilations
[
1
]
!=
1
)
{
args
.
dilated
=
true
;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args
.
batch
=
input
->
dims
()[
0
];
args
.
in_depth
=
input
->
dims
()[
1
];
args
.
in_rows
=
input
->
dims
()[
2
];
args
.
in_cols
=
input
->
dims
()[
3
];
std
::
cout
<<
"In N: "
<<
args
.
batch
<<
" C: "
<<
args
.
in_depth
<<
" H: "
<<
args
.
in_rows
<<
" W: "
<<
args
.
in_cols
<<
"
\n
"
;
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args
.
out_num
=
output
->
dims
()[
0
];
args
.
out_depth
=
output
->
dims
()[
1
];
args
.
out_rows
=
output
->
dims
()[
2
];
args
.
out_cols
=
output
->
dims
()[
3
];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args
.
filter_rows
=
filter
.
dims
()[
2
];
args
.
filter_cols
=
filter
.
dims
()[
3
];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
param
)
const
{
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclConvOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
// ArmConvImplement imp;
// imp.Compute(param);
param
.
Output
()
->
mutable_data
<
float
>
()[
0
]
=
100.0
;
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclConvOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclConvOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ConvKernel
<
GPU_MALI
,
float
>;
...
...
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/elementwise_add_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
T
>
struct
AddFunctor
{
inline
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
template
<
>
bool
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Init
(
const
ElementwiseAddParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ElementwiseAddKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ElementwiseAddParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
Out
=
param
.
Out
();
Out
->
mutable_data
<
float
>
();
int
axis
=
param
.
Axis
();
ElementwiseComputeEx
<
AddFunctor
<
float
>
,
float
>
(
input_x
,
input_y
,
axis
,
AddFunctor
<
float
>
(),
Out
);
}
template
class
ElementwiseAddKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/fushion_fc_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
FusionFcKernel
<
GPU_MALI
,
float
>::
Init
(
const
FusionFcParam
&
para
)
const
{
return
true
;
}
template
<
>
void
FusionFcKernel
<
GPU_MALI
,
float
>::
Compute
(
const
FusionFcParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_z
=
param
.
InputZ
();
auto
*
input_z_data
=
input_z
->
data
<
float
>
();
int
axis
=
param
.
Axis
();
Tensor
*
out
=
param
.
Out
();
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
input_z
->
dims
().
size
()
==
1
,
"inpu_z size must be 1"
);
PADDLE_MOBILE_ENFORCE
(
out_dim
[
1
]
==
input_z
->
dims
()[
0
],
" out_dim.size must be 2."
);
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
)
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
DLOG
<<
out_data
[
i
];
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
));
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/lrn_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclLrnOp
:
public
acl
::
ACLOperator
{
public:
AclLrnOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_LRN
;
}
~
AclLrnOp
()
=
default
;
AclLrnOp
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
&
operator
=
(
const
AclLrnOp
&
)
=
delete
;
AclLrnOp
(
AclLrnOp
&&
)
=
delete
;
AclLrnOp
&
operator
=
(
AclLrnOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
LrnParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
arm_compute
::
NormalizationLayerInfo
norm_info
(
arm_compute
::
NormType
::
CROSS_MAP
,
args
.
nsize
,
args
.
alpha
,
args
.
beta
,
args
.
knorm
);
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
lrn
,
this
,
norm_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
LrnParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
LrnParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
InputX
();
Tensor
*
out
=
param
.
Out
();
int
n
=
param
.
N
();
T
alpha
=
param
.
Alpha
();
T
beta
=
param
.
Beta
();
T
k
=
param
.
K
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
nsize
=
n
;
args
.
alpha
=
alpha
;
args
.
beta
=
beta
;
args
.
knorm
=
k
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
LrnKernel
<
GPU_MALI
,
float
>::
Init
(
const
LrnParam
&
param
)
const
{
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclLrnOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
LrnKernel
<
GPU_MALI
,
float
>::
Compute
(
const
LrnParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclLrnOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclLrnOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
}
}
template
class
LrnKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/mul_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
MulKernel
<
GPU_MALI
,
float
>::
Init
(
const
MulParam
&
para
)
const
{
return
true
;
}
template
<
>
void
MulKernel
<
GPU_MALI
,
float
>::
Compute
(
const
MulParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
Tensor
*
out
=
param
.
Out
();
out
->
mutable_data
<
float
>
();
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
:
*
input_x
;
const
Tensor
y_matrix
=
input_y
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_y
,
param
.
YNumColDims
())
:
*
input_y
;
auto
out_dim
=
out
->
dims
();
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
0
));
if
(
out_dim
.
size
()
!=
2
)
{
out
->
Resize
(
out_dim
);
}
}
template
class
MulKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/pool_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclPoolOp
:
public
acl
::
ACLOperator
{
public:
AclPoolOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_POOLING
;
}
~
AclPoolOp
()
=
default
;
AclPoolOp
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
&
operator
=
(
const
AclPoolOp
&
)
=
delete
;
AclPoolOp
(
AclPoolOp
&&
)
=
delete
;
AclPoolOp
&
operator
=
(
AclPoolOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
PoolParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
,
args
.
in_rows
,
args
.
in_depth
);
arm_compute
::
TensorShape
output_shape
(
args
.
out_cols
,
args
.
out_rows
,
args
.
out_depth
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
PoolingLayerInfo
pool_info
;
if
(
args
.
pool_type
==
"max"
)
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
MAX
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
else
{
pool_info
=
arm_compute
::
PoolingLayerInfo
(
arm_compute
::
PoolingType
::
AVG
,
args
.
filter_rows
,
arm_compute
::
PadStrideInfo
(
args
.
stride_cols
,
args
.
stride_rows
,
args
.
pad_cols
,
args
.
pad_rows
,
arm_compute
::
DimensionRoundingType
::
CEIL
));
}
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
pooling
,
this
,
pool_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
PoolParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
if
(
args
.
pool_type
!=
"max"
&&
args
.
pool_type
!=
"avg"
)
{
bypass_acl
=
true
;
}
if
(
args
.
filter_rows
!=
args
.
filter_cols
)
{
bypass_acl
=
true
;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
PoolParam
&
param
)
{
const
Tensor
*
in_x
=
param
.
Input
();
Tensor
*
out
=
param
.
Output
();
std
::
string
pooling_type
=
param
.
PoolingType
();
std
::
vector
<
int
>
ksize
=
param
.
Ksize
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
bool
is_global_pooling
=
param
.
isGlobalPooling
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
is_global_pool
=
is_global_pooling
;
args
.
pool_type
=
pooling_type
;
args
.
filter_rows
=
ksize
[
0
];
args
.
filter_cols
=
ksize
[
1
];
args
.
dim
=
ksize
.
size
();
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
in_rows
=
in_x
->
dims
()[
2
];
args
.
in_cols
=
in_x
->
dims
()[
3
];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args
.
stride_rows
=
strides
[
0
];
args
.
stride_cols
=
strides
[
1
];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args
.
pad_rows
=
paddings
[
0
];
args
.
pad_cols
=
paddings
[
1
];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
args
.
out_depth
=
args
.
in_depth
;
// args.out_rows = out->dims()[2];
// args.out_cols = out->dims()[3];
args
.
out_rows
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_rows
+
2
*
args
.
pad_rows
-
args
.
filter_rows
)
/
args
.
stride_rows
))
+
1
;
args
.
out_cols
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
args
.
in_cols
+
2
*
args
.
pad_cols
-
args
.
filter_cols
)
/
args
.
stride_cols
))
+
1
;
if
(
is_global_pooling
)
{
args
.
filter_rows
=
args
.
in_rows
;
args
.
filter_cols
=
args
.
in_cols
;
args
.
pad_rows
=
0
;
args
.
pad_cols
=
0
;
}
}
acl
::
AclParameters
args
;
};
template
<
>
bool
PoolKernel
<
GPU_MALI
,
float
>::
Init
(
const
PoolParam
&
param
)
const
{
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclPoolOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
PoolKernel
<
GPU_MALI
,
float
>::
Compute
(
const
PoolParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclPoolOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclPoolOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
batch
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
*
args
.
in_cols
*
args
.
in_rows
;
output_data
+=
args
.
in_depth
*
args
.
out_cols
*
args
.
out_rows
;
}
}
template
class
PoolKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/relu_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclReluOp
:
public
acl
::
ACLOperator
{
public:
AclReluOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_RELU
;
}
~
AclReluOp
()
=
default
;
AclReluOp
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
&
operator
=
(
const
AclReluOp
&
)
=
delete
;
AclReluOp
(
AclReluOp
&&
)
=
delete
;
AclReluOp
&
operator
=
(
AclReluOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
ReluParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
input_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
batch
);
arm_compute
::
TensorShape
output_shape
(
args
.
in_cols
*
args
.
in_rows
*
args
.
in_depth
*
args
.
out_num
);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
type
;
type
=
arm_compute
::
ActivationLayerInfo
::
ActivationFunction
::
RELU
;
arm_compute
::
ActivationLayerInfo
act_info
(
type
);
if
(
is_operator_init_done
(
input_shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
input_shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
output_shape
,
args
.
output_data
);
acl_configure
(
activation
,
this
,
act_info
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
ReluParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
ReluParam
&
param
)
{
const
auto
*
input_x
=
param
.
InputX
();
auto
*
out
=
param
.
Out
();
const
T
*
input_data
=
input_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
args
.
batch
=
input_x
->
dims
()[
0
];
args
.
in_depth
=
input_x
->
dims
()[
1
];
args
.
in_rows
=
input_x
->
dims
()[
2
];
args
.
in_cols
=
input_x
->
dims
()[
3
];
args
.
out_num
=
out
->
dims
()[
0
];
}
acl
::
AclParameters
args
;
};
template
<
>
bool
ReluKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReluParam
&
param
)
const
{
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclReluOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
ReluKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReluParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclReluOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclReluOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
}
template
class
ReluKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/kernel/mali/reshape_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ReshapeKernel
<
GPU_MALI
,
float
>::
Init
(
const
ReshapeParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ReshapeKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ReshapeParam
&
param
)
const
{
const
auto
*
input_x
=
param
.
InputX
();
const
auto
&
input_x_dims
=
input_x
->
dims
();
auto
*
out
=
param
.
Out
();
framework
::
DDim
out_dims
=
out
->
dims
();
const
auto
*
input_shape
=
param
.
InputShape
();
if
(
input_shape
)
{
auto
*
shape_data
=
input_shape
->
data
<
int
>
();
framework
::
Tensor
cpu_shape_tensor
;
auto
shape
=
std
::
vector
<
int
>
(
shape_data
,
shape_data
+
input_shape
->
numel
());
out_dims
=
ValidateShape
(
shape
,
input_x
->
dims
());
}
bool
inplace
=
param
.
Inplace
();
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
<
float
>
();
framework
::
TensorCopy
(
*
input_x
,
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
input_x
);
out
->
Resize
(
out_dims
);
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/mali/softmax_kernel.cpp
0 → 100644
浏览文件 @
bec8987d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
AclSoftmaxOp
:
public
acl
::
ACLOperator
{
public:
AclSoftmaxOp
()
{
this
->
force_bypass_acl_path_
=
bypass_acl_class_layer
&
FLAGS_ENABLE_ACL_SOFTMAX
;
}
~
AclSoftmaxOp
()
=
default
;
AclSoftmaxOp
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
const
AclSoftmaxOp
&
)
=
delete
;
AclSoftmaxOp
(
AclSoftmaxOp
&&
)
=
delete
;
AclSoftmaxOp
&
operator
=
(
AclSoftmaxOp
&&
)
=
delete
;
acl
::
AclParameters
&
getargs
()
{
return
args
;
}
void
InitAclLayer
(
const
SoftmaxParam
&
param
)
{
setTargetHint
(
acl
::
TargetHint
::
OPENCL
);
arm_compute
::
TensorShape
shape
(
args
.
in_depth
,
args
.
batch
);
if
(
is_operator_init_done
(
shape
))
return
;
set_operator_init_done
();
this
->
force_bypass_acl_path_
=
false
;
//[width, height, IFM]
new_tensor
(
input
(),
shape
,
args
.
input_data
);
//[width, height, OFM]
new_tensor
(
output
(),
shape
,
args
.
output_data
);
acl_configure
(
softmax
,
this
,
NULL
);
}
void
RunAcl
(
void
*
input
,
void
*
output
)
{
acl
::
ACLOperator
::
acl_run
(
input
,
output
);
}
bool
Bypass_acl
(
const
SoftmaxParam
&
param
)
{
bool
bypass_acl
=
false
;
AclParametersByContext
(
param
);
// for performance, more groups impact GPU performance
if
(
this
->
force_bypass_acl_path_
)
{
bypass_acl
=
true
;
}
return
bypass_acl
;
}
private:
void
AclParametersByContext
(
const
SoftmaxParam
&
param
)
{
const
framework
::
Tensor
*
in_x
=
param
.
InputX
();
framework
::
Tensor
*
out
=
param
.
Out
();
auto
x_dims
=
in_x
->
dims
();
out
->
Resize
(
x_dims
);
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
data
<
T
>
();
args
.
input_data
=
(
void
*
)
input_data
;
args
.
output_data
=
(
void
*
)
output_data
;
// NCHW
args
.
batch
=
in_x
->
dims
()[
0
];
args
.
in_depth
=
in_x
->
dims
()[
1
];
args
.
out_num
=
out
->
dims
()[
0
];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl
::
AclParameters
args
;
};
template
<
>
bool
SoftmaxKernel
<
GPU_MALI
,
float
>::
Init
(
const
SoftmaxParam
&
param
)
const
{
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
acl_op
=
new
AclSoftmaxOp
<
GPU_MALI
,
float
>
();
this
->
SetAclOp
((
void
*
)
acl_op
,
(
void
*
)
this
);
}
return
true
;
}
template
<
>
void
SoftmaxKernel
<
GPU_MALI
,
float
>::
Compute
(
const
SoftmaxParam
&
param
)
const
{
std
::
cout
<<
"init acl"
<<
std
::
endl
;
AclSoftmaxOp
<
GPU_MALI
,
float
>*
acl_op
=
reinterpret_cast
<
AclSoftmaxOp
<
GPU_MALI
,
float
>*>
(
this
->
GetAclOp
());
if
(
acl_op
==
nullptr
)
{
return
;
}
if
(
acl_op
->
Bypass_acl
(
param
))
{
std
::
cout
<<
"init acl failed"
<<
std
::
endl
;
return
;
}
acl
::
AclParameters
&
args
=
acl_op
->
getargs
();
const
float
*
input_data
=
(
const
float
*
)
args
.
input_data
;
const
float
*
output_data
=
(
const
float
*
)
args
.
output_data
;
acl_op
->
InitAclLayer
(
param
);
for
(
int
n
=
0
;
n
<
args
.
out_num
;
++
n
)
{
acl_op
->
RunAcl
((
void
*
)
input_data
,
(
void
*
)
output_data
);
input_data
+=
args
.
in_depth
;
output_data
+=
args
.
in_depth
;
}
}
template
class
SoftmaxKernel
<
GPU_MALI
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
#endif
src/operators/lrn_op.cpp
浏览文件 @
bec8987d
...
...
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU
(
lrn
,
ops
::
LrnOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
lrn
);
REGISTER_OPERATOR_MALI_GPU
(
lrn
,
ops
::
LrnOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/mul_op.cpp
浏览文件 @
bec8987d
...
...
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
mul
);
REGISTER_OPERATOR_MALI_GPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/pool_op.cpp
浏览文件 @
bec8987d
...
...
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
pool2d
);
REGISTER_OPERATOR_MALI_GPU
(
pool2d
,
ops
::
PoolOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/relu_op.cpp
浏览文件 @
bec8987d
...
...
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU
(
relu
,
ops
::
ReluOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
relu
);
REGISTER_OPERATOR_MALI_GPU
(
relu
,
ops
::
ReluOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/reshape_op.cpp
浏览文件 @
bec8987d
...
...
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
reshape
);
REGISTER_OPERATOR_MALI_GPU
(
reshape
,
ops
::
ReshapeOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
src/operators/softmax_op.cpp
浏览文件 @
bec8987d
...
...
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
softmax
);
REGISTER_OPERATOR_MALI_GPU
(
softmax
,
ops
::
SoftmaxOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...
tools/build.sh
浏览文件 @
bec8987d
...
...
@@ -15,6 +15,7 @@ build_for_mac() {
fi
PLATFORM
=
"x86"
MODE
=
"Release"
CXX_FLAGS
=
"-std=c++11 -O3 -s"
BUILD_DIR
=
../build/release/
"
${
PLATFORM
}
"
mkdir
-p
${
BUILD_DIR
}
/build
...
...
@@ -24,6 +25,7 @@ build_for_mac() {
cmake ..
\
-B
"
${
BUILD_DIR
}
"
\
-DCMAKE_BUILD_TYPE
=
"
${
MODE
}
"
\
-DCMAKE_CXX_FLAGS
=
"
${
CXX_FLAGS
}
"
\
-DIS_MAC
=
true
cd
${
BUILD_DIR
}
...
...
@@ -31,6 +33,8 @@ build_for_mac() {
}
build_for_android
()
{
export
ANDROID_NDK
=
/home/halsay/android-ndk-r16b
export
NDK_ROOT
=
/home/halsay/android-ndk-r16b
rm
-rf
"../build"
if
[
-z
"
${
ANDROID_NDK
}
"
]
;
then
echo
"ANDROID_NDK not found!"
...
...
@@ -44,11 +48,11 @@ build_for_android() {
if
[
"
${
PLATFORM
}
"
=
"arm-v7a"
]
;
then
ABI
=
"armeabi-v7a with NEON"
ARM_PLATFORM
=
"V7"
CXX_FLAGS
=
"-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
CXX_FLAGS
=
"-
O3 -std=c++11 -s -
march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
elif
[
"
${
PLATFORM
}
"
=
"arm-v8a"
]
;
then
ABI
=
"arm64-v8a"
ARM_PLATFORM
=
"V8"
CXX_FLAGS
=
"-
march=armv8-a -pie -fPIE -w -Wno-error=format-security -llo
g"
CXX_FLAGS
=
"-
std=c++11 -march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog -O0 -ggdb3 -fno-inline -
g"
else
echo
"unknown platform!"
exit
-1
...
...
@@ -56,7 +60,7 @@ build_for_android() {
MODE
=
"Release"
ANDROID_PLATFORM_VERSION
=
"android-
15
"
ANDROID_PLATFORM_VERSION
=
"android-
22
"
TOOLCHAIN_FILE
=
"./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE
=
"arm"
if
[
$#
-eq
1
]
;
then
...
...
@@ -171,4 +175,4 @@ else
build_error
fi
fi
fi
fi
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录