Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
ba7458fa
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ba7458fa
编写于
3月 18, 2019
作者:
R
Ray Liu
提交者:
GitHub
3月 18, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into backup
上级
60fd1c83
edd7f241
变更
21
显示空白变更内容
内联
并排
Showing
21 changed file
with
1358 addition
and
142 deletion
+1358
-142
CMakeLists.txt
CMakeLists.txt
+1
-1
src/framework/cl/cl_engine.cpp
src/framework/cl/cl_engine.cpp
+14
-9
src/framework/cl/cl_engine.h
src/framework/cl/cl_engine.h
+21
-1
src/framework/cl/cl_scope.h
src/framework/cl/cl_scope.h
+8
-8
src/io/opencl_interface.cpp
src/io/opencl_interface.cpp
+35
-0
src/io/opencl_interface.h
src/io/opencl_interface.h
+27
-0
src/io/paddle_mobile.cpp
src/io/paddle_mobile.cpp
+22
-23
src/operators/activation_op.cpp
src/operators/activation_op.cpp
+3
-0
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+851
-0
src/operators/kernel/cl/cl_kernel/sigmoid.cl
src/operators/kernel/cl/cl_kernel/sigmoid.cl
+30
-0
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+165
-55
src/operators/kernel/cl/conv_add_kernel.cpp
src/operators/kernel/cl/conv_add_kernel.cpp
+101
-38
src/operators/kernel/cl/reshape_kernel.cpp
src/operators/kernel/cl/reshape_kernel.cpp
+2
-0
src/operators/kernel/cl/sigmoid_kernel.cpp
src/operators/kernel/cl/sigmoid_kernel.cpp
+46
-0
src/operators/kernel/conv_add_bn_relu_kernel.h
src/operators/kernel/conv_add_bn_relu_kernel.h
+3
-0
src/operators/kernel/conv_add_kernel.h
src/operators/kernel/conv_add_kernel.h
+3
-0
src/operators/op_param.cpp
src/operators/op_param.cpp
+5
-0
test/CMakeLists.txt
test/CMakeLists.txt
+9
-0
test/net/test_mobilenet_GPU.cpp
test/net/test_mobilenet_GPU.cpp
+5
-6
tools/build.sh
tools/build.sh
+1
-1
tools/op.cmake
tools/op.cmake
+6
-0
未找到文件。
CMakeLists.txt
浏览文件 @
ba7458fa
...
@@ -187,7 +187,7 @@ else()
...
@@ -187,7 +187,7 @@ else()
set
(
NET
"default"
CACHE STRING
"select net type"
)
set
(
NET
"default"
CACHE STRING
"select net type"
)
endif
()
endif
()
set_property
(
CACHE NET PROPERTY STRINGS
"default"
"googlenet"
"mobilenet"
"yolo"
"squeezenet"
"FPGA_NET_V1"
"FPGA_NET_V2"
"NLP"
)
set_property
(
CACHE NET PROPERTY STRINGS
"default"
"googlenet"
"mobilenet"
"yolo"
"squeezenet"
"FPGA_NET_V1"
"FPGA_NET_V2"
"NLP"
"op"
)
include
(
"
${
CMAKE_CURRENT_LIST_DIR
}
/tools/op.cmake"
)
include
(
"
${
CMAKE_CURRENT_LIST_DIR
}
/tools/op.cmake"
)
# build library
# build library
...
...
src/framework/cl/cl_engine.cpp
浏览文件 @
ba7458fa
...
@@ -27,9 +27,9 @@ bool CLEngine::Init() {
...
@@ -27,9 +27,9 @@ bool CLEngine::Init() {
return
true
;
return
true
;
}
}
cl_int
status
;
cl_int
status
;
SetPlatform
();
bool
is_setplatform_success
=
SetPlatform
();
SetClDeviceId
();
bool
is_setcldeviceid_success
=
SetClDeviceId
();
is_init_success_
=
is_setplatform_success
&&
is_setcldeviceid_success
;
initialized_
=
true
;
initialized_
=
true
;
return
initialized_
;
return
initialized_
;
// setClCommandQueue();
// setClCommandQueue();
...
@@ -44,11 +44,14 @@ CLEngine *CLEngine::Instance() {
...
@@ -44,11 +44,14 @@ CLEngine *CLEngine::Instance() {
return
&
cl_engine_
;
return
&
cl_engine_
;
}
}
bool
CLEngine
::
isInitSuccess
()
{
return
is_init_success_
;
}
bool
CLEngine
::
SetPlatform
()
{
bool
CLEngine
::
SetPlatform
()
{
platform_
=
NULL
;
// the chosen platform
platform_
=
NULL
;
// the chosen platform
cl_uint
numPlatforms
;
// the NO. of platforms
cl_uint
numPlatforms
;
// the NO. of platforms
cl_int
status
=
clGetPlatformIDs
(
0
,
NULL
,
&
numPlatforms
);
cl_int
status
=
clGetPlatformIDs
(
0
,
NULL
,
&
numPlatforms
);
if
(
status
!=
CL_SUCCESS
)
{
return
false
;
}
/**For clarity, choose the first available platform. */
/**For clarity, choose the first available platform. */
if
(
numPlatforms
>
0
)
{
if
(
numPlatforms
>
0
)
{
cl_platform_id
*
platforms
=
reinterpret_cast
<
cl_platform_id
*>
(
cl_platform_id
*
platforms
=
reinterpret_cast
<
cl_platform_id
*>
(
...
@@ -56,10 +59,10 @@ bool CLEngine::SetPlatform() {
...
@@ -56,10 +59,10 @@ bool CLEngine::SetPlatform() {
status
=
clGetPlatformIDs
(
numPlatforms
,
platforms
,
NULL
);
status
=
clGetPlatformIDs
(
numPlatforms
,
platforms
,
NULL
);
platform_
=
platforms
[
0
];
platform_
=
platforms
[
0
];
free
(
platforms
);
free
(
platforms
);
return
true
;
return
status
==
CL_SUCCESS
;
}
else
{
return
false
;
}
}
return
false
;
}
}
bool
CLEngine
::
SetClDeviceId
()
{
bool
CLEngine
::
SetClDeviceId
()
{
...
@@ -67,13 +70,15 @@ bool CLEngine::SetClDeviceId() {
...
@@ -67,13 +70,15 @@ bool CLEngine::SetClDeviceId() {
devices_
=
NULL
;
devices_
=
NULL
;
cl_int
status
=
cl_int
status
=
clGetDeviceIDs
(
platform_
,
CL_DEVICE_TYPE_GPU
,
0
,
NULL
,
&
numDevices
);
clGetDeviceIDs
(
platform_
,
CL_DEVICE_TYPE_GPU
,
0
,
NULL
,
&
numDevices
);
if
(
status
!=
CL_SUCCESS
)
{
return
false
;
}
if
(
numDevices
>
0
)
{
if
(
numDevices
>
0
)
{
devices_
=
reinterpret_cast
<
cl_device_id
*>
(
devices_
=
reinterpret_cast
<
cl_device_id
*>
(
malloc
(
numDevices
*
sizeof
(
cl_device_id
)));
malloc
(
numDevices
*
sizeof
(
cl_device_id
)));
status
=
clGetDeviceIDs
(
platform_
,
CL_DEVICE_TYPE_GPU
,
numDevices
,
devices_
,
status
=
clGetDeviceIDs
(
platform_
,
CL_DEVICE_TYPE_GPU
,
numDevices
,
devices_
,
NULL
);
NULL
);
return
true
;
return
status
==
CL_SUCCESS
;
}
}
return
false
;
return
false
;
}
}
...
...
src/framework/cl/cl_engine.h
浏览文件 @
ba7458fa
...
@@ -31,7 +31,7 @@ class CLEngine {
...
@@ -31,7 +31,7 @@ class CLEngine {
static
CLEngine
*
Instance
();
static
CLEngine
*
Instance
();
bool
Init
();
bool
Init
();
bool
isInitSuccess
();
std
::
unique_ptr
<
_cl_context
,
CLContextDeleter
>
CreateContext
()
{
std
::
unique_ptr
<
_cl_context
,
CLContextDeleter
>
CreateContext
()
{
cl_int
status
;
cl_int
status
;
cl_context
c
=
clCreateContext
(
NULL
,
1
,
devices_
,
NULL
,
NULL
,
&
status
);
cl_context
c
=
clCreateContext
(
NULL
,
1
,
devices_
,
NULL
,
NULL
,
&
status
);
...
@@ -51,6 +51,20 @@ class CLEngine {
...
@@ -51,6 +51,20 @@ class CLEngine {
return
std
::
move
(
command_queue_ptr
);
return
std
::
move
(
command_queue_ptr
);
}
}
cl_context
getContext
()
{
if
(
context_
==
nullptr
)
{
context_
=
CreateContext
();
}
return
context_
.
get
();
}
cl_command_queue
getClCommandQueue
()
{
if
(
command_queue_
==
nullptr
)
{
command_queue_
=
CreateClCommandQueue
(
getContext
());
}
return
command_queue_
.
get
();
}
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>
CreateProgramWith
(
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>
CreateProgramWith
(
cl_context
context
,
std
::
string
file_name
)
{
cl_context
context
,
std
::
string
file_name
)
{
FILE
*
file
=
fopen
(
file_name
.
c_str
(),
"rb"
);
FILE
*
file
=
fopen
(
file_name
.
c_str
(),
"rb"
);
...
@@ -137,6 +151,11 @@ class CLEngine {
...
@@ -137,6 +151,11 @@ class CLEngine {
std
::
string
cl_path_
;
std
::
string
cl_path_
;
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>
program_
;
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>
program_
;
std
::
unique_ptr
<
_cl_context
,
CLContextDeleter
>
context_
=
nullptr
;
std
::
unique_ptr
<
_cl_command_queue
,
CLCommQueueDeleter
>
command_queue_
=
nullptr
;
// bool SetClContext();
// bool SetClContext();
// bool SetClCommandQueue();
// bool SetClCommandQueue();
...
@@ -144,6 +163,7 @@ class CLEngine {
...
@@ -144,6 +163,7 @@ class CLEngine {
// bool LoadKernelFromFile(const char *kernel_file);
// bool LoadKernelFromFile(const char *kernel_file);
// bool BuildProgram();
// bool BuildProgram();
bool
is_init_success_
=
false
;
};
};
}
// namespace framework
}
// namespace framework
...
...
src/framework/cl/cl_scope.h
浏览文件 @
ba7458fa
...
@@ -29,12 +29,12 @@ namespace framework {
...
@@ -29,12 +29,12 @@ namespace framework {
class
CLScope
{
class
CLScope
{
public:
public:
CLScope
()
{
CLScope
()
{
CLEngine
*
engin
=
CLEngine
::
Instance
();
CLEngine
*
engin
e
=
CLEngine
::
Instance
();
context_
=
engin
->
Create
Context
();
context_
=
engin
e
->
get
Context
();
command_queue_
=
engin
->
CreateClCommandQueue
(
context_
.
get
()
);
command_queue_
=
engin
e
->
getClCommandQueue
(
);
}
}
cl_command_queue
CommandQueue
()
{
return
command_queue_
.
get
()
;
}
cl_command_queue
CommandQueue
()
{
return
command_queue_
;
}
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
GetKernel
(
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
GetKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
...
@@ -49,7 +49,7 @@ class CLScope {
...
@@ -49,7 +49,7 @@ class CLScope {
return
std
::
move
(
kernel
);
return
std
::
move
(
kernel
);
}
}
cl_context
Context
()
{
return
context_
.
get
()
;
}
cl_context
Context
()
{
return
context_
;
}
cl_program
Program
(
const
std
::
string
&
file_name
)
{
cl_program
Program
(
const
std
::
string
&
file_name
)
{
auto
it
=
programs_
.
find
(
file_name
);
auto
it
=
programs_
.
find
(
file_name
);
...
@@ -58,7 +58,7 @@ class CLScope {
...
@@ -58,7 +58,7 @@ class CLScope {
}
}
auto
program
=
CLEngine
::
Instance
()
->
CreateProgramWith
(
auto
program
=
CLEngine
::
Instance
()
->
CreateProgramWith
(
context_
.
get
()
,
context_
,
CLEngine
::
Instance
()
->
GetCLPath
()
+
"/cl_kernel/"
+
file_name
);
CLEngine
::
Instance
()
->
GetCLPath
()
+
"/cl_kernel/"
+
file_name
);
DLOG
<<
" --- begin build program -> "
<<
file_name
<<
" --- "
;
DLOG
<<
" --- begin build program -> "
<<
file_name
<<
" --- "
;
...
@@ -72,8 +72,8 @@ class CLScope {
...
@@ -72,8 +72,8 @@ class CLScope {
private:
private:
cl_int
status_
;
cl_int
status_
;
std
::
unique_ptr
<
_cl_context
,
CLContextDeleter
>
context_
;
cl_context
context_
;
std
::
unique_ptr
<
_cl_command_queue
,
CLCommQueueDeleter
>
command_queue_
;
cl_command_queue
command_queue_
;
std
::
unordered_map
<
std
::
string
,
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>>
std
::
unique_ptr
<
_cl_program
,
CLProgramDeleter
>>
programs_
;
programs_
;
...
...
src/io/opencl_interface.cpp
0 → 100644
浏览文件 @
ba7458fa
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "io/opencl_interface.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_scope.h"
namespace
paddle_mobile
{
cl_context
getContext
()
{
return
framework
::
CLEngine
::
Instance
()
->
getContext
();
}
cl_command_queue
getClCommandQueue
()
{
return
framework
::
CLEngine
::
Instance
()
->
getClCommandQueue
();
}
bool
isInitSuccess
()
{
return
framework
::
CLEngine
::
Instance
()
->
isInitSuccess
();
}
}
// namespace paddle_mobile
#endif
src/io/opencl_interface.h
0 → 100644
浏览文件 @
ba7458fa
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_CL
#include "CL/cl.h"
namespace
paddle_mobile
{
cl_context
getContext
();
cl_command_queue
getClCommandQueue
();
bool
isInitSuccess
();
#endif
}
// namespace paddle_mobile
src/io/paddle_mobile.cpp
浏览文件 @
ba7458fa
...
@@ -20,6 +20,8 @@ limitations under the License. */
...
@@ -20,6 +20,8 @@ limitations under the License. */
#endif // _OPENMP
#endif // _OPENMP
#ifdef PADDLE_MOBILE_CL
#ifdef PADDLE_MOBILE_CL
#include <CL/cl.h>
#include <CL/cl.h>
#include <mutex>
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_tensor.h"
#include "framework/cl/cl_tensor.h"
#endif
#endif
#include "operators/math/gemm.h"
#include "operators/math/gemm.h"
...
@@ -202,11 +204,15 @@ double PaddleMobile<CPU, float>::GetPredictTime() {
...
@@ -202,11 +204,15 @@ double PaddleMobile<CPU, float>::GetPredictTime() {
operators
::
math
::
Gemm
gemm
;
operators
::
math
::
Gemm
gemm
;
auto
time1
=
paddle_mobile
::
time
();
auto
time1
=
paddle_mobile
::
time
();
int
times
=
4
;
for
(
int
j
=
0
;
j
<
times
;
++
j
)
{
gemm
.
Sgemm
(
m
,
n
,
k
,
static_cast
<
float
>
(
1
),
a
,
lda
,
b
,
ldb
,
gemm
.
Sgemm
(
m
,
n
,
k
,
static_cast
<
float
>
(
1
),
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
false
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
false
,
static_cast
<
float
*>
(
nullptr
));
static_cast
<
float
*>
(
nullptr
));
}
auto
time2
=
paddle_mobile
::
time
();
auto
time2
=
paddle_mobile
::
time
();
double
cost
=
paddle_mobile
::
time_diff
(
time1
,
time2
);
double
cost
=
paddle_mobile
::
time_diff
(
time1
,
time2
)
/
times
;
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
paddle_mobile
::
memory
::
Free
(
c
);
...
@@ -282,21 +288,11 @@ void PaddleMobile<Device, T>::SetCLPath(std::string path) {
...
@@ -282,21 +288,11 @@ void PaddleMobile<Device, T>::SetCLPath(std::string path) {
template
<
>
template
<
>
double
PaddleMobile
<
GPU_CL
,
float
>::
GetPredictTime
()
{
double
PaddleMobile
<
GPU_CL
,
float
>::
GetPredictTime
()
{
cl_int
status
;
cl_int
status
;
cl_uint
nPlatform
;
if
(
!
framework
::
CLEngine
::
Instance
()
->
isInitSuccess
())
{
clGetPlatformIDs
(
0
,
NULL
,
&
nPlatform
);
return
-
1
;
cl_platform_id
*
listPlatform
=
reinterpret_cast
<
cl_platform_id
*>
(
}
malloc
(
nPlatform
*
sizeof
(
cl_platform_id
)));
cl_context
context
=
framework
::
CLEngine
::
Instance
()
->
getContext
();
clGetPlatformIDs
(
nPlatform
,
listPlatform
,
NULL
);
cl_command_queue
queue
=
framework
::
CLEngine
::
Instance
()
->
getClCommandQueue
();
cl_uint
nDevice
=
0
;
clGetDeviceIDs
(
listPlatform
[
0
],
CL_DEVICE_TYPE_GPU
,
0
,
NULL
,
&
nDevice
);
cl_device_id
*
listDevice
=
reinterpret_cast
<
cl_device_id
*>
(
malloc
(
nDevice
*
sizeof
(
cl_device_id
)));
clGetDeviceIDs
(
listPlatform
[
0
],
CL_DEVICE_TYPE_GPU
,
nDevice
,
listDevice
,
NULL
);
cl_context
context
=
clCreateContext
(
NULL
,
nDevice
,
listDevice
,
NULL
,
NULL
,
&
status
);
cl_command_queue
queue
=
clCreateCommandQueue
(
context
,
listDevice
[
0
],
0
,
&
status
);
int
n
=
1
;
int
n
=
1
;
int
c
=
3
;
int
c
=
3
;
...
@@ -410,7 +406,7 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
...
@@ -410,7 +406,7 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
clFinish
(
queue
);
clFinish
(
queue
);
queue
=
clCreateCommandQueue
(
context
,
listDevice
[
0
],
0
,
&
status
);
//
queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
path
=
framework
::
CLEngine
::
Instance
()
->
GetCLPath
()
+
path
=
framework
::
CLEngine
::
Instance
()
->
GetCLPath
()
+
"/cl_kernel/conv_kernel.cl"
;
"/cl_kernel/conv_kernel.cl"
;
...
@@ -465,15 +461,18 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
...
@@ -465,15 +461,18 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
// cl_event wait_event = param.Input()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
size_t
global_work_size2
[
3
]
=
{
8
,
224
,
224
};
size_t
global_work_size2
[
3
]
=
{
8
,
224
,
224
};
auto
time1
=
paddle_mobile
::
time
();
auto
time1
=
paddle_mobile
::
time
();
int
times
=
10
;
for
(
int
i
=
0
;
i
<
times
;
++
i
)
{
status
=
clEnqueueNDRangeKernel
(
queue
,
kernel
,
3
,
NULL
,
global_work_size2
,
status
=
clEnqueueNDRangeKernel
(
queue
,
kernel
,
3
,
NULL
,
global_work_size2
,
NULL
,
0
,
NULL
,
NULL
);
NULL
,
0
,
NULL
,
NULL
);
}
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
clFinish
(
queue
);
clFinish
(
queue
);
auto
time2
=
paddle_mobile
::
time
();
auto
time2
=
paddle_mobile
::
time
();
paddle_mobile
::
memory
::
Free
(
input
);
paddle_mobile
::
memory
::
Free
(
input
);
paddle_mobile
::
memory
::
Free
(
filter
);
paddle_mobile
::
memory
::
Free
(
filter
);
if
(
status
==
CL_SUCCESS
)
{
if
(
status
==
CL_SUCCESS
)
{
return
paddle_mobile
::
time_diff
(
time1
,
time2
);
return
paddle_mobile
::
time_diff
(
time1
,
time2
)
/
times
;
}
else
{
}
else
{
return
-
1
;
return
-
1
;
}
}
...
...
src/operators/activation_op.cpp
浏览文件 @
ba7458fa
...
@@ -66,6 +66,9 @@ REGISTER_OPERATOR_CL(relu, ops::ReluOp);
...
@@ -66,6 +66,9 @@ REGISTER_OPERATOR_CL(relu, ops::ReluOp);
#ifdef PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU
(
sigmoid
,
ops
::
SigmoidOp
);
REGISTER_OPERATOR_CPU
(
sigmoid
,
ops
::
SigmoidOp
);
#endif
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL
(
sigmoid
,
ops
::
SigmoidOp
);
#endif
#endif // SIGMOID_OP
#endif // SIGMOID_OP
#ifdef TANH_OP
#ifdef TANH_OP
...
...
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
浏览文件 @
ba7458fa
...
@@ -561,7 +561,858 @@ __kernel void conv_1x1(__private const int global_size_dim0,
...
@@ -561,7 +561,858 @@ __kernel void conv_1x1(__private const int global_size_dim0,
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
}
__kernel void conv_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
// int out_w1 = out_w + global_size_dim1;
// int out_w2 = out_w + global_size_dim1 * 2;
// int out_w3 = out_w + global_size_dim1 * 3;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE
half4 output0= read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output0 = 0.0f;
// half4 output1 = 0.0f;
// half4 output2 = 0.0f;
// half4 output3 = 0.0f;
#else
half4 output0 = 0.0f;
half4 output1 = 0.0f;
half4 output2 = 0.0f;
half4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
//
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
if (out_w1 < old_w){
write_imageh(output_image, output_pos1, output1);
}
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
if (out_w2 < old_w){
write_imageh(output_image, output_pos2, output2);
}
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
if (out_w3 < old_w){
write_imageh(output_image, output_pos3, output3);
}
}
__kernel void conv_1x1_spl2(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int out_w4 = out_w + global_size_dim1 * 4;
int out_w5 = out_w + global_size_dim1 * 5;
int out_w6 = out_w + global_size_dim1 * 6;
int out_w7 = out_w + global_size_dim1 * 7;
// int out_w1 = out_w + global_size_dim1;
// int out_w2 = out_w + global_size_dim1 * 2;
// int out_w3 = out_w + global_size_dim1 * 3;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
int2 in_pos_in_one_block4 =
ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
int2 in_pos_in_one_block5 =
ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
int2 in_pos_in_one_block6 =
ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
int2 in_pos_in_one_block7 =
ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
#ifdef BIASE
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output0 = 0.0f;
// half4 output1 = 0.0f;
// half4 output2 = 0.0f;
// half4 output3 = 0.0f;
#else
half4 output0 = 0.0f;
half4 output1 = 0.0f;
half4 output2 = 0.0f;
half4 output3 = 0.0f;
half4 output4 = 0.0f;
half4 output5 = 0.0f;
half4 output6 = 0.0f;
half4 output7 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
//
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
// -------------4--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
half4 input4 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output4 = mad(input4.x, weight0, output4);
output4 = mad(input4.y, weight1, output4);
output4 = mad(input4.z, weight2, output4);
output4 = mad(input4.w, weight3, output4);
// -------------5--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
half4 input5 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output5= mad(input5.x, weight0, output5);
output5 = mad(input5.y, weight1, output5);
output5 = mad(input5.z, weight2, output5);
output5 = mad(input5.w, weight3, output5);
// -------------6--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
half4 input6 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output6 = mad(input6.x, weight0, output6);
output6 = mad(input6.y, weight1, output6);
output6 = mad(input6.z, weight2, output6);
output6 = mad(input6.w, weight3, output6);
// -------------7--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
half4 input7 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output7 = mad(input7.x, weight0, output7);
output7 = mad(input7.y, weight1, output7);
output7 = mad(input7.z, weight2, output7);
output7 = mad(input7.w, weight3, output7);
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
output4 = activation(output4);
output5 = activation(output5);
output6 = activation(output6);
output7 = activation(output7);
#endif
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
if (out_w1 < old_w){
write_imageh(output_image, output_pos1, output1);
}
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
if (out_w2 < old_w){
write_imageh(output_image, output_pos2, output2);
}
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
if (out_w3 < old_w){
write_imageh(output_image, output_pos3, output3);
}
int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
if (out_w4 < old_w){
write_imageh(output_image, output_pos4, output4);
}
int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
if (out_w5 < old_w){
write_imageh(output_image, output_pos5, output5);
}
int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
if (out_w6 < old_w){
write_imageh(output_image, output_pos6, output6);
}
int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
if (out_w7 < old_w){
write_imageh(output_image, output_pos7, output7);
}
}
__kernel void conv_1x1_spl3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
// int out_w3 = out_w + global_size_dim1 * 3;
// int out_w4 = out_w + global_size_dim1 * 4;
// int out_w5 = out_w + global_size_dim1 * 5;
// int out_w6 = out_w + global_size_dim1 * 6;
// int out_w7 = out_w + global_size_dim1 * 7;
// int out_w1 = out_w + global_size_dim1;
// int out_w2 = out_w + global_size_dim1 * 2;
// int out_w3 = out_w + global_size_dim1 * 3;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
// int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
// int2 in_pos_in_one_block2 =
// ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
//
// int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
// int2 in_pos_in_one_block3 =
// ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
//
// int2 ouput_pos_in_one_block4 = (int2)(out_w4, out_nh);
// int2 in_pos_in_one_block4 =
// ouput_pos_in_one_block4 * stride_xy + (int2)(offset, offset);
//
// int2 ouput_pos_in_one_block5 = (int2)(out_w5, out_nh);
// int2 in_pos_in_one_block5 =
// ouput_pos_in_one_block5 * stride_xy + (int2)(offset, offset);
//
// int2 ouput_pos_in_one_block6 = (int2)(out_w6, out_nh);
// int2 in_pos_in_one_block6 =
// ouput_pos_in_one_block6 * stride_xy + (int2)(offset, offset);
//
// int2 ouput_pos_in_one_block7 = (int2)(out_w7, out_nh);
// int2 in_pos_in_one_block7 =
// ouput_pos_in_one_block7 * stride_xy + (int2)(offset, offset);
#ifdef BIASE
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output4 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output5 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output6 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output7 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output0 = 0.0f;
// half4 output1 = 0.0f;
// half4 output2 = 0.0f;
// half4 output3 = 0.0f;
#else
half4 output0 = 0.0f;
half4 output1 = 0.0f;
// half4 output2 = 0.0f;
// half4 output3 = 0.0f;
// half4 output4 = 0.0f;
// half4 output5 = 0.0f;
// half4 output6 = 0.0f;
// half4 output7 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
//
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// * 4 + 3));
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
//
// // -------------2--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
// half4 input2 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output2 = mad(input2.x, weight0, output2);
// output2 = mad(input2.y, weight1, output2);
// output2 = mad(input2.z, weight2, output2);
// output2 = mad(input2.w, weight3, output2);
//
// // -------------3--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
// half4 input3 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output3 = mad(input3.x, weight0, output3);
// output3 = mad(input3.y, weight1, output3);
// output3 = mad(input3.z, weight2, output3);
// output3 = mad(input3.w, weight3, output3);
//
//
// // -------------4--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block4.x, in_pos_in_one_block4.y);
// half4 input4 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output4 = mad(input4.x, weight0, output4);
// output4 = mad(input4.y, weight1, output4);
// output4 = mad(input4.z, weight2, output4);
// output4 = mad(input4.w, weight3, output4);
//
//
//
// // -------------5--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block5.x, in_pos_in_one_block5.y);
// half4 input5 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output5= mad(input5.x, weight0, output5);
// output5 = mad(input5.y, weight1, output5);
// output5 = mad(input5.z, weight2, output5);
// output5 = mad(input5.w, weight3, output5);
//
//
// // -------------6--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block6.x, in_pos_in_one_block6.y);
// half4 input6 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output6 = mad(input6.x, weight0, output6);
// output6 = mad(input6.y, weight1, output6);
// output6 = mad(input6.z, weight2, output6);
// output6 = mad(input6.w, weight3, output6);
//
//
// // -------------7--------------
// pos_in = (int2)(i * input_width + in_pos_in_one_block7.x, in_pos_in_one_block7.y);
// half4 input7 = read_imageh(input_image, sampler, pos_in);
//
// // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
// // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
// // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
// // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
// // * 4 + 3));
//
// output7 = mad(input7.x, weight0, output7);
// output7 = mad(input7.y, weight1, output7);
// output7 = mad(input7.z, weight2, output7);
// output7 = mad(input7.w, weight3, output7);
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output4 = output4 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output5 = output5 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output6 = output6 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
//
// output7 = output7 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
// read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
// output2 = activation(output2);
// output3 = activation(output3);
// output4 = activation(output4);
// output5 = activation(output5);
// output6 = activation(output6);
// output7 = activation(output7);
#endif
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
if (out_w1 < old_w){
write_imageh(output_image, output_pos1, output1);
}
//
// int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
// if (out_w2 < old_w){
// write_imageh(output_image, output_pos2, output2);
// }
//
// int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
// if (out_w3 < old_w){
// write_imageh(output_image, output_pos3, output3);
// }
//
// int2 output_pos4 = (int2)(outpos_main + out_w4, out_nh);
// if (out_w4 < old_w){
// write_imageh(output_image, output_pos4, output4);
// }
//
// int2 output_pos5 = (int2)(outpos_main + out_w5, out_nh);
// if (out_w5 < old_w){
// write_imageh(output_image, output_pos5, output5);
//
// }
// int2 output_pos6 = (int2)(outpos_main + out_w6, out_nh);
// if (out_w6 < old_w){
// write_imageh(output_image, output_pos6, output6);
// }
//
// int2 output_pos7 = (int2)(outpos_main + out_w7, out_nh);
// if (out_w7 < old_w){
// write_imageh(output_image, output_pos7, output7);
// }
}
//__kernel void conv_1x1_c(
// __private const int global_size_dim0,
// __private const int global_size_dim1,
// __private const int global_size_dim2,
// __read_only image2d_t input_image,
// __read_only image2d_t filter,
//#ifdef BIASE
// __read_only image2d_t bias,
//#endif
//#ifdef BATCH_NORM
// __read_only image2d_t new_scale,
// __read_only image2d_t new_biase,
//#endif
// __write_only image2d_t output_image,
// __private const int stride,
// __private const int offset,
// __private const int input_c,
// __private const int dilation,
// __private const int input_width, /* of one block */
// __private const int input_height, /* of one block */
// __private const int output_width,
// __private const int output_height,
// __private const int old_w) {
//
// const int out_c = get_global_id(0);
// const int out_w = get_global_id(1);
// const int out_nh = get_global_id(2);
//
// const sampler_t sampler =
// CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
// const int2 stride_xy = (int2)(stride, stride);
//
// for (int i = 0; i < input_c; ++i) {
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
// half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
// half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
// half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
//
//#pragma unroll
// for (int j = 0; j < 4; ++j) {
// int out_w0 = out_w + global_size_dim1 * j;
// int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
// int2 in_pos_in_one_block0 = ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
//
//#ifdef BIASE
// half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
//#else
// half4 output0 = 0.0f;
//#endif
// int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
// half4 input0 = read_imageh(input_image, sampler, pos_in);
//
// output0 = mad(input0.x, weight0, output0);
// output0 = mad(input0.y, weight1, output0);
// output0 = mad(input0.z, weight2, output0);
// output0 = mad(input0.w, weight3, output0);
//
//#ifdef BATCH_NORM
// output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
//#endif
//
//#ifdef RELU
// output0 = activation(output0);
//#endif
// int outpos_main = mul24(out_c, old_w);
// int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
//
// if (out_w0 < old_w) {
// write_imageh(output_image, output_pos0, output0);
// }
// }
// }
//}
/*
/*
...
...
src/operators/kernel/cl/cl_kernel/sigmoid.cl
0 → 100644
浏览文件 @
ba7458fa
/*
Copyright
(
c
)
2018
PaddlePaddle
Authors.
All
Rights
Reserved.
Licensed
under
the
Apache
License,
Version
2.0
(
the
"License"
)
;
you
may
not
use
this
file
except
in
compliance
with
the
License.
You
may
obtain
a
copy
of
the
License
at
http://www.apache.org/licenses/LICENSE-2.0
Unless
required
by
applicable
law
or
agreed
to
in
writing,
software
distributed
under
the
License
is
distributed
on
an
"AS IS"
BASIS,
WITHOUT
WARRANTIES
OR
CONDITIONS
OF
ANY
KIND,
either
express
or
implied.
See
the
License
for
the
specific
language
governing
permissions
and
limitations
under
the
License.
*/
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__kernel
void
sigmoid
(
__read_only
image2d_t
input,
__write_only
image2d_t
output
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_TRUE
|
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
half4
in
=
read_imageh
(
input,
sampler,
(
int2
)(
x,
y
))
;
in
=
1.0f
/
(
1
+
exp
(
-in
))
;
write_imageh
(
output,
(
int2
)(
x,
y
)
,
in
)
;
}
\ No newline at end of file
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
浏览文件 @
ba7458fa
...
@@ -21,7 +21,7 @@ limitations under the License. */
...
@@ -21,7 +21,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
bool
optimise
=
true
;
template
<
>
template
<
>
bool
ConvAddBNReluKernel
<
GPU_CL
,
float
>::
Init
(
bool
ConvAddBNReluKernel
<
GPU_CL
,
float
>::
Init
(
FusionConvAddBNReluParam
<
GPU_CL
>
*
param
)
{
FusionConvAddBNReluParam
<
GPU_CL
>
*
param
)
{
...
@@ -139,7 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
...
@@ -139,7 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
if
(
param
->
Filter
()
->
dims
()[
2
]
==
1
&&
param
->
Filter
()
->
dims
()[
3
]
==
1
)
{
if
(
param
->
Filter
()
->
dims
()[
2
]
==
1
&&
param
->
Filter
()
->
dims
()[
3
]
==
1
)
{
param
->
Filter
()
->
InitNImage
(
cl_helper_
.
CLContext
(),
param
->
Filter
()
->
InitNImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
cl_helper_
.
CLCommandQueue
());
if
(
optimise
)
{
this
->
cl_helper_
.
AddKernel
(
"conv_1x1_spl"
,
"conv_add_bn_relu_kernel.cl"
);
}
else
{
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_add_bn_relu_kernel.cl"
);
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_add_bn_relu_kernel.cl"
);
}
DLOG
<<
" conv add bn relu conv 1x1"
;
DLOG
<<
" conv add bn relu conv 1x1"
;
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
...
@@ -205,10 +210,13 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
...
@@ -205,10 +210,13 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
cl_int
status
;
cl_int
status
;
if
(
optimise
)
{
if
(
param
.
Filter
()
->
dims
()[
2
]
==
1
&&
param
.
Filter
()
->
dims
()[
3
]
==
1
)
{
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
int
maped_w
=
maptofactor
(
w
,
4
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
maped_w
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
...
@@ -256,30 +264,132 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
...
@@ -256,30 +264,132 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
status
=
clSetKernelArg
(
kernel
,
16
,
sizeof
(
int
),
&
output_height
);
status
=
clSetKernelArg
(
kernel
,
16
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
// cl_event out_event = param.Output()->GetClEvent(
);
status
=
clSetKernelArg
(
kernel
,
17
,
sizeof
(
int
),
&
w
);
// cl_event wait_event = param.Input()->GetClEvent(
);
CL_CHECK_ERRORS
(
status
);
/*
const
size_t
work_size
[
3
]
=
{
if (param.Filter()->dims()[2] == 1 &&
static_cast
<
const
uint32_t
>
(
default_work_size
.
data
()[
0
]),
param.Filter()->dims()[3] == 1 &&
static_cast
<
const
uint32_t
>
(
maped_w
),
param.Filter()->dims()[0] % 16 == 0) {
static_cast
<
const
uint32_t
>
(
default_work_size
.
data
()[
2
])};
DLOG << " before modifi work size: " << default_work_size;
default_work_size[0] = default_work_size[0] / 4;
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
else
{
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
DLOG << " modification work size: " << default_work_size;
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
DLOG << " input dims " << param.Input()->dims();
CL_CHECK_ERRORS
(
status
);
DLOG << " output dims " << param.Output()->dims();
DLOG << " filter dims: " << param.Filter()->dims();
DLOG << " biase dims : " << param.Bias()->dims();
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
input
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
filter
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
biase
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
new_scale
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
cl_mem
),
&
new_bias
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
cl_mem
),
&
output
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
stride
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
input_c
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
dilation
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
input_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
input_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
output_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
16
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
}
*/
}
else
{
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
input
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
filter
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
biase
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
new_scale
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
cl_mem
),
&
new_bias
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
cl_mem
),
&
output
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
stride
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
input_c
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
dilation
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
input_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
input_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
output_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
16
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clEnqueueNDRangeKernel
(
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
()
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
}
}
}
template
class
ConvAddBNReluKernel
<
GPU_CL
,
float
>;
template
class
ConvAddBNReluKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/conv_add_kernel.cpp
浏览文件 @
ba7458fa
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
bool
optimise_convadd
=
true
;
template
<
>
template
<
>
bool
ConvAddKernel
<
GPU_CL
,
float
>::
Init
(
FusionConvAddParam
<
GPU_CL
>
*
param
)
{
bool
ConvAddKernel
<
GPU_CL
,
float
>::
Init
(
FusionConvAddParam
<
GPU_CL
>
*
param
)
{
...
@@ -35,8 +36,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
...
@@ -35,8 +36,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
if
(
param
->
Filter
()
->
dims
()[
2
]
==
1
&&
param
->
Filter
()
->
dims
()[
3
]
==
1
)
{
if
(
param
->
Filter
()
->
dims
()[
2
]
==
1
&&
param
->
Filter
()
->
dims
()[
3
]
==
1
)
{
param
->
Filter
()
->
InitNImage
(
cl_helper_
.
CLContext
(),
param
->
Filter
()
->
InitNImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
cl_helper_
.
CLCommandQueue
());
if
(
optimise_convadd
)
{
this
->
cl_helper_
.
AddKernel
(
"conv_1x1_spl"
,
"conv_add_kernel.cl"
);
}
else
{
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_add_kernel.cl"
);
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_add_kernel.cl"
);
}
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
==
3
)
{
param
->
Filter
()
->
dims
()[
2
]
==
3
)
{
...
@@ -95,10 +99,13 @@ void ConvAddKernel<GPU_CL, float>::Compute(
...
@@ -95,10 +99,13 @@ void ConvAddKernel<GPU_CL, float>::Compute(
cl_int
status
;
cl_int
status
;
if
(
optimise_convadd
&&
param
.
Filter
()
->
dims
()[
2
]
==
1
&&
param
.
Filter
()
->
dims
()[
3
]
==
1
)
{
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
int
maped_w
=
maptofactor
(
w
,
4
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
maped_w
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
...
@@ -140,13 +147,69 @@ void ConvAddKernel<GPU_CL, float>::Compute(
...
@@ -140,13 +147,69 @@ void ConvAddKernel<GPU_CL, float>::Compute(
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
output_height
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
// cl_event out_event = param.Output()->GetClEvent();
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
w
);
// cl_event wait_event = param.Input()->GetClEvent();
CL_CHECK_ERRORS
(
status
);
const
size_t
work_size
[
3
]
=
{
static_cast
<
const
uint32_t
>
(
default_work_size
.
data
()[
0
]),
static_cast
<
const
uint32_t
>
(
maped_w
),
static_cast
<
const
uint32_t
>
(
default_work_size
.
data
()[
2
])};
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
else
{
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
input
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
filter
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
biase
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
output
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
int
),
&
stride
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
int
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
input_c
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
dilation
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
input_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
input_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
output_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clEnqueueNDRangeKernel
(
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
()
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
}
}
}
template
class
ConvAddKernel
<
GPU_CL
,
float
>;
template
class
ConvAddKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/reshape_kernel.cpp
浏览文件 @
ba7458fa
...
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef RESHAPE_OP
#include "operators/kernel/reshape_kernel.h"
#include "operators/kernel/reshape_kernel.h"
...
@@ -102,3 +103,4 @@ template class ReshapeKernel<GPU_CL, float>;
...
@@ -102,3 +103,4 @@ template class ReshapeKernel<GPU_CL, float>;
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
src/operators/kernel/cl/sigmoid_kernel.cpp
0 → 100644
浏览文件 @
ba7458fa
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SIGMOID_OP
#include "operators/kernel/activation_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
SigmoidKernel
<
GPU_CL
,
float
>::
Init
(
SigmoidParam
<
GPU_CL
>*
param
)
{
this
->
cl_helper_
.
AddKernel
(
"sigmoid"
,
"sigmoid.cl"
);
return
true
;
}
template
<
>
void
SigmoidKernel
<
GPU_CL
,
float
>::
Compute
(
const
SigmoidParam
<
GPU_CL
>&
param
)
{
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
const
auto
*
input
=
param
.
InputX
();
auto
*
output
=
param
.
Out
();
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
output
);
auto
inputImage
=
input
->
GetCLImage
();
auto
outputImage
=
output
->
GetCLImage
();
clSetKernelArg
(
kernel
,
0
,
sizeof
(
cl_mem
),
&
inputImage
);
clSetKernelArg
(
kernel
,
1
,
sizeof
(
cl_mem
),
&
outputImage
);
const
size_t
work_size
[
2
]
=
{
input
->
ImageWidth
(),
input
->
ImageHeight
()};
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
work_size
,
NULL
,
0
,
NULL
,
NULL
);
}
template
class
SigmoidKernel
<
GPU_CL
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_bn_relu_kernel.h
浏览文件 @
ba7458fa
...
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
...
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
public:
public:
void
Compute
(
const
FusionConvAddBNReluParam
<
DeviceType
>
&
param
);
void
Compute
(
const
FusionConvAddBNReluParam
<
DeviceType
>
&
param
);
bool
Init
(
FusionConvAddBNReluParam
<
DeviceType
>
*
param
);
bool
Init
(
FusionConvAddBNReluParam
<
DeviceType
>
*
param
);
inline
int
maptofactor
(
int
i
,
int
factor
)
{
return
(
i
+
factor
-
1
)
/
factor
;
}
};
};
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/conv_add_kernel.h
浏览文件 @
ba7458fa
...
@@ -41,6 +41,9 @@ class ConvAddKernel
...
@@ -41,6 +41,9 @@ class ConvAddKernel
public:
public:
void
Compute
(
const
FusionConvAddParam
<
DeviceType
>
&
param
);
void
Compute
(
const
FusionConvAddParam
<
DeviceType
>
&
param
);
bool
Init
(
FusionConvAddParam
<
DeviceType
>
*
param
);
bool
Init
(
FusionConvAddParam
<
DeviceType
>
*
param
);
inline
int
maptofactor
(
int
i
,
int
factor
)
{
return
(
i
+
factor
-
1
)
/
factor
;
}
};
};
}
// namespace operators
}
// namespace operators
...
...
src/operators/op_param.cpp
浏览文件 @
ba7458fa
...
@@ -44,12 +44,17 @@ template class ConvParam<FPGA>;
...
@@ -44,12 +44,17 @@ template class ConvParam<FPGA>;
template
class
ConvParam
<
GPU_MALI
>;
template
class
ConvParam
<
GPU_MALI
>;
#endif
#endif
#ifdef ELEMENTWISEADD_OP
template
class
ElementwiseAddParam
<
CPU
>;
template
class
ElementwiseAddParam
<
CPU
>;
template
class
ElementwiseAddParam
<
FPGA
>;
template
class
ElementwiseAddParam
<
FPGA
>;
template
class
ElementwiseAddParam
<
GPU_MALI
>;
template
class
ElementwiseAddParam
<
GPU_MALI
>;
#endif
#ifdef ELEMENTWISEMUL_OP
template
class
ElementwiseMulParam
<
CPU
>;
template
class
ElementwiseMulParam
<
CPU
>;
template
class
ElementwiseMulParam
<
FPGA
>;
template
class
ElementwiseMulParam
<
FPGA
>;
template
class
ElementwiseMulParam
<
GPU_MALI
>;
template
class
ElementwiseMulParam
<
GPU_MALI
>;
#endif
#ifdef MUL_OP
#ifdef MUL_OP
template
class
MulParam
<
CPU
>;
template
class
MulParam
<
CPU
>;
...
...
test/CMakeLists.txt
浏览文件 @
ba7458fa
...
@@ -154,6 +154,15 @@ if (CON GREATER -1)
...
@@ -154,6 +154,15 @@ if (CON GREATER -1)
endif
()
endif
()
list
(
FIND NET
"op"
CON
)
if
(
CON GREATER -1
)
# gen test
ADD_EXECUTABLE
(
test-sigmoid operators/test_sigmoid_op.cpp test_include.h
)
target_link_libraries
(
test-sigmoid paddle-mobile
)
set
(
FOUND_MATCH ON
)
endif
()
if
(
NOT FOUND_MATCH
)
if
(
NOT FOUND_MATCH
)
# gen test
# gen test
ADD_EXECUTABLE
(
test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h
)
ADD_EXECUTABLE
(
test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h
)
...
...
test/net/test_mobilenet_GPU.cpp
浏览文件 @
ba7458fa
...
@@ -25,11 +25,11 @@ int main() {
...
@@ -25,11 +25,11 @@ int main() {
paddle_mobile
.
SetCLPath
(
"/data/local/tmp/bin"
);
paddle_mobile
.
SetCLPath
(
"/data/local/tmp/bin"
);
#endif
#endif
auto
isok
=
paddle_mobile
.
Load
(
//
auto isok = paddle_mobile.Load(
std
::
string
(
g_mobilenet_vision
)
+
"/vision_mobilenet_model"
,
//
std::string(g_mobilenet_vision) + "/vision_mobilenet_model",
std
::
string
(
g_mobilenet_vision
)
+
"/vision_mobilenet_params"
,
true
);
//
std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true);
//
auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
auto
isok
=
paddle_mobile
.
Load
(
std
::
string
(
g_mobilenet
),
true
);
if
(
isok
)
{
if
(
isok
)
{
auto
time2
=
paddle_mobile
::
time
();
auto
time2
=
paddle_mobile
::
time
();
std
::
cout
<<
"load cost :"
<<
paddle_mobile
::
time_diff
(
time1
,
time2
)
<<
"ms"
std
::
cout
<<
"load cost :"
<<
paddle_mobile
::
time_diff
(
time1
,
time2
)
<<
"ms"
...
@@ -37,8 +37,7 @@ int main() {
...
@@ -37,8 +37,7 @@ int main() {
std
::
vector
<
float
>
input
;
std
::
vector
<
float
>
input
;
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
224
,
224
};
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
224
,
224
};
GetInput
<
float
>
(
g_test_image_1x3x224x224_vision_mobilenet_input
,
&
input
,
GetInput
<
float
>
(
g_test_image_1x3x224x224_banana
,
&
input
,
dims
);
dims
);
std
::
vector
<
float
>
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
std
::
vector
<
float
>
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
...
...
tools/build.sh
浏览文件 @
ba7458fa
#!/usr/bin/env bash
#!/usr/bin/env bash
NETS
=
""
NETS
=
""
declare
-a
supportedNets
=(
"googlenet"
"mobilenet"
"yolo"
"squeezenet"
"resnet"
"mobilenetssd"
"nlp"
"mobilenetfssd"
"genet"
"super"
)
declare
-a
supportedNets
=(
"googlenet"
"mobilenet"
"yolo"
"squeezenet"
"resnet"
"mobilenetssd"
"nlp"
"mobilenetfssd"
"genet"
"super"
"op"
)
build_for_mac
()
{
build_for_mac
()
{
if
[
!
`
which brew
`
]
;
then
if
[
!
`
which brew
`
]
;
then
...
...
tools/op.cmake
浏览文件 @
ba7458fa
...
@@ -228,6 +228,12 @@ if (CON GREATER -1)
...
@@ -228,6 +228,12 @@ if (CON GREATER -1)
set
(
FOUND_MATCH ON
)
set
(
FOUND_MATCH ON
)
endif
()
endif
()
list
(
FIND NET
"op"
CON
)
if
(
CON GREATER -1
)
message
(
"op enabled"
)
set
(
SIGMOID_OP ON
)
set
(
FOUND_MATCH ON
)
endif
()
if
(
NOT FOUND_MATCH
)
if
(
NOT FOUND_MATCH
)
message
(
"--default--"
)
message
(
"--default--"
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录