Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
f07dd516
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
f07dd516
编写于
11月 16, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor the opencl kernel build logic.
上级
77ea99f5
变更
27
显示空白变更内容
内联
并排
Showing
27 changed file
with
509 addition
and
440 deletion
+509
-440
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+28
-80
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+7
-7
mace/core/types.cc
mace/core/types.cc
+29
-0
mace/core/types.h
mace/core/types.h
+3
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+2
-4
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+3
-3
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+5
-5
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+8
-5
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+14
-12
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+3
-0
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+66
-65
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+86
-34
mace/kernels/opencl/cl/conv_helper.cl
mace/kernels/opencl/cl/conv_helper.cl
+0
-41
mace/kernels/opencl/cl/conv_helper.h
mace/kernels/opencl/cl/conv_helper.h
+0
-15
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+91
-32
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+76
-69
mace/kernels/opencl/cl/relu.cl
mace/kernels/opencl/cl/relu.cl
+12
-9
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+13
-10
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+14
-11
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+7
-5
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+7
-6
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+8
-5
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+16
-14
mace/kernels/opencl/relu_opencl.cc
mace/kernels/opencl/relu_opencl.cc
+4
-3
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+3
-2
mace/kernels/opencl/space_to_batch_opecl.cc
mace/kernels/opencl/space_to_batch_opecl.cc
+3
-3
mace/proto/mace.proto
mace/proto/mace.proto
+1
-0
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
f07dd516
...
...
@@ -7,8 +7,6 @@
#include <memory>
#include <mutex>
#include <dirent.h>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
...
...
@@ -32,55 +30,6 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
return
true
;
}
bool
BuildProgram
(
OpenCLRuntime
*
runtime
,
const
std
::
string
&
path
,
cl
::
Program
*
program
)
{
MACE_CHECK_NOTNULL
(
program
);
auto
closer
=
[](
DIR
*
d
)
{
if
(
d
!=
nullptr
)
closedir
(
d
);
};
std
::
unique_ptr
<
DIR
,
decltype
(
closer
)
>
dir
(
opendir
(
path
.
c_str
()),
closer
);
MACE_CHECK_NOTNULL
(
dir
.
get
());
const
std
::
string
kSourceSuffix
=
".cl"
;
cl
::
Program
::
Sources
sources
;
errno
=
0
;
dirent
*
entry
=
readdir
(
dir
.
get
());
MACE_CHECK
(
errno
==
0
);
while
(
entry
!=
nullptr
)
{
if
(
entry
->
d_type
==
DT_REG
)
{
std
::
string
d_name
(
entry
->
d_name
);
if
(
d_name
.
size
()
>
kSourceSuffix
.
size
()
&&
d_name
.
compare
(
d_name
.
size
()
-
kSourceSuffix
.
size
(),
kSourceSuffix
.
size
(),
kSourceSuffix
)
==
0
)
{
std
::
string
filename
=
path
+
d_name
;
std
::
string
kernel_source
;
MACE_CHECK
(
ReadSourceFile
(
filename
,
&
kernel_source
));
sources
.
push_back
({
kernel_source
.
c_str
(),
kernel_source
.
length
()});
}
}
entry
=
readdir
(
dir
.
get
());
MACE_CHECK
(
errno
==
0
);
};
*
program
=
cl
::
Program
(
runtime
->
context
(),
sources
);
std
::
string
build_options
=
"-Werror -cl-mad-enable -cl-fast-relaxed-math -I"
+
path
;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
cl_int
ret
=
program
->
build
({
runtime
->
device
()},
build_options
.
c_str
());
if
(
ret
!=
CL_SUCCESS
)
{
if
(
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_STATUS
>
(
runtime
->
device
())
==
CL_BUILD_ERROR
)
{
std
::
string
build_log
=
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_LOG
>
(
runtime
->
device
());
LOG
(
INFO
)
<<
"Program build log: "
<<
build_log
;
}
LOG
(
FATAL
)
<<
"Build program failed: "
<<
ret
;
}
return
true
;
}
}
// namespace
...
...
@@ -156,58 +105,57 @@ cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl
::
Program
&
OpenCLRuntime
::
program
()
{
// TODO(heliangliang) Support binary format
static
const
char
*
kernel_path
=
getenv
(
"MACE_KERNEL_PATH"
);
std
::
string
path
(
kernel_path
==
nullptr
?
""
:
kernel_path
);
std
::
call_once
(
build_flag_
,
[
this
,
&
path
]()
{
MACE_CHECK
(
BuildProgram
(
this
,
path
,
&
program_
));
});
return
program_
;
}
const
std
::
unodered_map
<
std
::
string
,
std
::
string
>
OpenCLRuntime
::
kernel_program_map_
=
{
{
"BatchNorm"
,
"batch_norm.cl"
}
const
std
::
map
<
std
::
string
,
std
::
string
>
OpenCLRuntime
::
program_map_
=
{
{
"addn"
,
"addn.cl"
},
{
"batch_norm"
,
"batch_norm.cl"
},
{
"conv_2d_1x1"
,
"conv_2d_1x1.cl"
},
{
"conv_2d_3x3"
,
"conv_2d_3x3.cl"
},
{
"depthwise_conv_3x3"
,
"depthwise_conv_3x3.cl"
},
{
"pooling"
,
"pooling.cl"
},
{
"relu"
,
"relu.cl"
},
{
"resize_bilinear"
,
"resize_bilinear.cl"
},
{
"space_to_batch"
,
"space_to_batch.cl"
},
};
bool
OpenCLRuntime
::
BuildProgram
(
const
std
::
string
&
kernel
_name
,
void
OpenCLRuntime
::
BuildProgram
(
const
std
::
string
&
program_file
_name
,
const
std
::
string
&
build_options
,
cl
::
Program
*
program
)
{
MACE_CHECK_NOTNULL
(
program
);
cl
::
Program
::
Sources
sources
;
std
::
string
filename
=
kernel_path_
+
kernel
_name
;
std
::
string
filename
=
kernel_path_
+
program_file
_name
;
std
::
string
kernel_source
;
MACE_CHECK
(
ReadSourceFile
(
filename
,
&
kernel_source
));
sources
.
push_back
({
kernel_source
.
c_str
(),
kernel_source
.
length
()});
*
program
=
cl
::
Program
(
this
->
context
(),
sources
);
build_options
+=
" -Werror -cl-mad-enable -cl-fast-relaxed-math -I"
+
path
;
std
::
string
build_options_str
=
build_options
+
" -Werror -cl-mad-enable -cl-fast-relaxed-math -I"
+
kernel_path_
;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
cl_int
ret
=
program
->
build
({
runtime
->
device
()},
build_options
.
c_str
());
cl_int
ret
=
program
->
build
({
device
()},
build_options_str
.
c_str
());
if
(
ret
!=
CL_SUCCESS
)
{
if
(
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_STATUS
>
(
runtime
->
device
())
==
if
(
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_STATUS
>
(
device
())
==
CL_BUILD_ERROR
)
{
std
::
string
build_log
=
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_LOG
>
(
runtime
->
device
());
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_LOG
>
(
device
());
LOG
(
INFO
)
<<
"Program build log: "
<<
build_log
;
}
LOG
(
FATAL
)
<<
"Build program failed: "
<<
ret
;
}
return
true
;
}
cl
::
Kernel
OpenCLRuntime
::
BuildKernel
(
const
std
::
string
&
kernel_name
,
cl
::
Kernel
OpenCLRuntime
::
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
)
{
auto
kernel_program_it
=
kernel_program_map_
.
find
(
kernel
_name
);
if
(
kernel_program_it
==
kernel_
program_map_
.
end
())
{
MACE_CHECK
(
false
,
kernel
_name
,
" opencl kernel doesn't exist."
);
auto
kernel_program_it
=
program_map_
.
find
(
program
_name
);
if
(
kernel_program_it
==
program_map_
.
end
())
{
MACE_CHECK
(
false
,
program
_name
,
" opencl kernel doesn't exist."
);
}
std
::
string
program_name
=
kernel_program_it
->
second
;
std
::
string
program_
file_
name
=
kernel_program_it
->
second
;
std
::
string
build_options_str
;
for
(
auto
&
option
:
build_options
)
{
build_options_str
+=
" "
+
option
;
...
...
@@ -219,10 +167,10 @@ cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
if
(
built_program_it
!=
built_program_map_
.
end
())
{
program
=
built_program_it
->
second
;
}
else
{
this
->
BuildProgram
(
kernel
_name
,
build_options_str
,
&
program
);
built_program_map_
.
emplace
(
built_program_key
,
std
::
move
(
program
)
);
this
->
BuildProgram
(
program_file
_name
,
build_options_str
,
&
program
);
built_program_map_
.
emplace
(
built_program_key
,
program
);
}
return
cl
::
Kernel
(
kernel_name
,
program
);
return
cl
::
Kernel
(
program
,
kernel_name
.
c_str
()
);
}
uint32_t
OpenCLRuntime
::
GetDeviceMaxWorkGroupSize
()
{
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
f07dd516
...
...
@@ -7,7 +7,7 @@
#include <map>
#include <mutex>
#include <
unordered_map
>
#include <
set
>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
...
...
@@ -25,7 +25,8 @@ class OpenCLRuntime {
uint32_t
GetDeviceMaxWorkGroupSize
();
uint32_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
kernel_name
,
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
private:
OpenCLRuntime
(
cl
::
Context
context
,
...
...
@@ -35,7 +36,7 @@ class OpenCLRuntime {
OpenCLRuntime
(
const
OpenCLRuntime
&
)
=
delete
;
OpenCLRuntime
&
operator
=
(
const
OpenCLRuntime
&
)
=
delete
;
bool
BuildProgram
(
const
std
::
string
&
kernel_name
,
void
BuildProgram
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
build_options
,
cl
::
Program
*
program
);
...
...
@@ -44,11 +45,10 @@ class OpenCLRuntime {
cl
::
Device
device_
;
cl
::
CommandQueue
command_queue_
;
cl
::
Program
program_
;
std
::
once_flag
build_flag_
;
std
::
string
kernel_path_
;
static
const
std
::
unordered_
map
<
std
::
string
,
std
::
string
>
kernel_
program_map_
;
mutable
std
::
unordered_
map
<
std
::
string
,
static
const
std
::
map
<
std
::
string
,
std
::
string
>
program_map_
;
mutable
std
::
map
<
std
::
string
,
cl
::
Program
>
built_program_map_
;
};
...
...
mace/core/types.cc
浏览文件 @
f07dd516
...
...
@@ -12,6 +12,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
case
DT_DOUBLE
:
case
DT_INT32
:
case
DT_INT64
:
case
DT_UINT32
:
case
DT_UINT16
:
case
DT_UINT8
:
case
DT_INT16
:
...
...
@@ -23,4 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
}
}
std
::
string
DataTypeToCLType
(
const
DataType
dt
)
{
switch
(
dt
)
{
case
DT_FLOAT
:
return
"float"
;
case
DT_HALF
:
return
"half"
;
case
DT_UINT8
:
return
"uchar"
;
case
DT_INT8
:
return
"char"
;
case
DT_DOUBLE
:
return
"double"
;
case
DT_INT32
:
return
"int"
;
case
DT_UINT32
:
return
"int"
;
case
DT_UINT16
:
return
"ushort"
;
case
DT_INT16
:
return
"short"
;
case
DT_INT64
:
return
"long"
;
default:
LOG
(
FATAL
)
<<
"Unsupported data type"
;
return
""
;
}
}
}
// namespace mace
\ No newline at end of file
mace/core/types.h
浏览文件 @
f07dd516
...
...
@@ -12,6 +12,8 @@ namespace mace {
bool
DataTypeCanUseMemcpy
(
DataType
dt
);
std
::
string
DataTypeToCLType
(
const
DataType
dt
);
template
<
class
T
>
struct
IsValidDataType
;
...
...
@@ -50,6 +52,7 @@ MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
MATCH_TYPE_AND_ENUM
(
int8_t
,
DT_INT8
);
MATCH_TYPE_AND_ENUM
(
string
,
DT_STRING
);
MATCH_TYPE_AND_ENUM
(
int64_t
,
DT_INT64
);
MATCH_TYPE_AND_ENUM
(
uint32_t
,
DT_UINT32
);
MATCH_TYPE_AND_ENUM
(
bool
,
DT_BOOL
);
static
const
int32_t
kint32_tmax
=
((
int32_t
)
0x7FFFFFFF
);
...
...
mace/kernels/batch_norm.h
浏览文件 @
f07dd516
...
...
@@ -76,9 +76,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const
Tensor
*
epsilon
,
Tensor
*
output
);
template
<
typename
T
>
struct
BatchNormFunctor
<
DeviceType
::
OPENCL
,
T
>
{
void
operator
()(
template
<
>
void
BatchNormFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
const
Tensor
*
offset
,
...
...
@@ -86,7 +85,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
const
Tensor
*
var
,
const
Tensor
*
epsilon
,
Tensor
*
output
);
};
}
// namepsace kernels
}
// namespace mace
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
f07dd516
...
...
@@ -15,9 +15,9 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
const
uint32_t
gws
=
blocks
;
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
()
;
auto
addn_kernel
=
cl
::
Kernel
(
program
,
"add2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
output
->
dtype
()));
auto
addn_kernel
=
runtime
->
BuildKernel
(
"addn"
,
"add2"
,
built_options
);
const
uint32_t
lws
=
runtime
->
GetKernelMaxWorkGroupSize
(
addn_kernel
);
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
f07dd516
...
...
@@ -10,8 +10,8 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
void
BatchNormFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
template
<
>
void
BatchNormFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
const
Tensor
*
offset
,
...
...
@@ -29,8 +29,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Get
();
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DD
ataType="
+
GetDataTypeFromEnum
(
input
->
dtype
()));
auto
bm_kernel
=
runtime
->
CreateKernel
(
"batch_norm"
);
built_options
.
emplace
(
"-DD
ATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
auto
bm_kernel
=
runtime
->
BuildKernel
(
"batch_norm"
,
"batch_norm"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
);
const
std
::
vector
<
uint32_t
>
lws
=
{
1
,
1
,
kwg_size
};
...
...
@@ -63,7 +63,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]));
MACE_CHECK
(
error
==
CL_SUCCESS
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
return
error
;
};
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/cl/addn.cl
浏览文件 @
f07dd516
__kernel
void
add2
(
__global
const
float
*input0,
__global
const
float
*input1,
#
include
<common.h>
//
Supported
data
type:
half/float
__kernel
void
add2
(
__global
const
DATA_TYPE
*input0,
__global
const
DATA_TYPE
*input1,
__private
const
int
size,
__global
float
*output
)
{
__global
DATA_TYPE
*output
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
+
4
>
size
)
{
...
...
@@ -9,8 +12,8 @@ __kernel void add2(__global const float *input0,
*
(
output+idx
)
=
*
(
input0+idx
)
+
*
(
input1+idx
)
;
}
}
else
{
float4
in_data0
=
vload4
(
idx,
input0
)
;
float4
in_data1
=
vload4
(
idx,
input1
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in_data0
=
vload4
(
idx,
input0
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in_data1
=
vload4
(
idx,
input1
)
;
vstore4
(
in_data0+in_data1,
idx,
output
)
;
}
}
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
f07dd516
void
kernel
batch_norm
(
global
const
float
*input,
global
const
float
*scale,
global
const
float
*offset,
global
const
float
*mean,
global
const
float
*var,
global
const
float
*epsilon,
#
include
<common.h>
//
Supported
data
types:
half/float
void
kernel
batch_norm
(
global
const
DATA_TYPE
*input,
global
const
DATA_TYPE
*scale,
global
const
DATA_TYPE
*offset,
global
const
DATA_TYPE
*mean,
global
const
DATA_TYPE
*var,
global
const
DATA_TYPE
*epsilon,
private
const
int
pixels,
global
float
*output,
__local
float4
*new_scale,
__local
float4
*new_offset
)
{
global
DATA_TYPE
*output,
__local
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
*new_scale,
__local
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
*new_offset
)
{
const
int
batch
=
get_global_id
(
0
)
;
const
int
channel
=
get_global_id
(
1
)
;
const
int
channels
=
get_global_size
(
1
)
;
...
...
@@ -23,8 +25,8 @@ void kernel batch_norm(global const float *input,
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
int
image_offset
=
(
batch
*
channels
+
channel
)
*
pixels
+
pixel_offset*4
;
const
float
*input_ptr
=
input
+
image_offset
;
float
*output_ptr
=
output
+
image_offset
;
const
DATA_TYPE
*input_ptr
=
input
+
image_offset
;
DATA_TYPE
*output_ptr
=
output
+
image_offset
;
const
int
end
=
(
batch
*
channels
+
channel
+
1
)
*
pixels
;
if
((
image_offset+4
)
>
end
)
{
for
(
int
i
=
image_offset
; i < end; ++i) {
...
...
@@ -33,7 +35,7 @@ void kernel batch_norm(global const float *input,
++output_ptr
;
}
}
else
{
float4
values
=
vload4
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
vload4
(
0
,
input_ptr
)
;
values
=
values
*
new_scale[local_channel]
+
new_offset[local_channel]
;
vstore4
(
values,
0
,
output_ptr
)
;
}
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
f07dd516
...
...
@@ -8,4 +8,7 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
f07dd516
...
...
@@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
}
#
define
vec_conv_2d_1x1_s1
\
float4
in0
=
vload4
(
0
,
input_ptr
)
; \
float4
in1
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
float4
in2
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
float4
in3
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in0
=
vload4
(
0
,
input_ptr
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in1
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in2
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in3
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
;
#
define
vec_conv_2d_1x1_s2
\
float4
in00
=
vload4
(
0
,
input_ptr
)
; \
float3
in01
=
vload3
(
0
,
input_ptr
+
4
)
; \
float4
in10
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
float3
in11
=
vload3
(
0
,
input_ptr
+
in_pixel
+
4
)
; \
float4
in20
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
float3
in21
=
vload3
(
0
,
input_ptr
+
2
*
in_pixel
+
4
)
;\
float4
in30
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
; \
float3
in31
=
vload3
(
0
,
input_ptr
+
3
*
in_pixel
+
4
)
; \
float4
in0
=
(
float4
)(
in00.s02,
in01.s02
)
; \
float4
in1
=
(
float4
)(
in10.s02,
in11.s02
)
; \
float4
in2
=
(
float4
)(
in20.s02,
in21.s02
)
; \
float4
in3
=
(
float4
)(
in30.s02,
in31.s02
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in00
=
vload4
(
0
,
input_ptr
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,3
)
in01
=
vload3
(
0
,
input_ptr
+
4
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in10
=
vload4
(
0
,
input_ptr
+
in_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,3
)
in11
=
vload3
(
0
,
input_ptr
+
in_pixel
+
4
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in20
=
vload4
(
0
,
input_ptr
+
2
*
in_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,3
)
in21
=
vload3
(
0
,
input_ptr
+
2
*
in_pixel
+
4
)
;\
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in30
=
vload4
(
0
,
input_ptr
+
3
*
in_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,3
)
in31
=
vload3
(
0
,
input_ptr
+
3
*
in_pixel
+
4
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in0
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)(
in00.s02,
in01.s02
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in1
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)(
in10.s02,
in11.s02
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)(
in20.s02,
in21.s02
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
in3
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)(
in30.s02,
in31.s02
)
;
#
define
vec_conv_2d_1x1_compute_loop
\
for
(
int
oc
=
0
; oc < 4; ++oc) { \
float4
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
; \
float4
out
=
vload4
(
0
,
output_ptr
+
oc
*
out_pixel
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
out
=
vload4
(
0
,
output_ptr
+
oc
*
out_pixel
)
; \
out
+=
in0
*
weights.x
; \
out
+=
in1
*
weights.y
; \
out
+=
in2
*
weights.z
; \
...
...
@@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
}
#
define
vec_conv_2d_1x1_compute
\
float4
weights
=
vload4
(
0
,
filter_ptr
)
; \
float4
out
=
vload4
(
0
,
output_ptr
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
weights
=
vload4
(
0
,
filter_ptr
)
; \
VEC_DATA_TYPE
(
DATA_TYPE,4
)
out
=
vload4
(
0
,
output_ptr
)
; \
out
+=
in0
*
weights.x
; \
out
+=
in1
*
weights.y
; \
out
+=
in2
*
weights.z
; \
out
+=
in3
*
weights.w
; \
vstore4
(
out,
0
,
output_ptr
)
;
__kernel
void
conv_2d_1x1_v2
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
__global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
__global
const
float
*bias,
/*
o
*/
__global
float
*output,
/*
n,
c,
h,
w
*/
//
Supported
data
type:
half/float
__kernel
void
conv_2d_1x1_v2
(
__global
const
DATA_TYPE
*input,
/*
n,
c,
h,
w
*/
__global
const
DATA_TYPE
*filter,
/*
o,
i,
kh,
kw
*/
#
ifdef
BIAS
__global
const
DATA_TYPE
*bias,
/*
o
*/
#
endif
/*
defined
(
BIAS
)
*/
__global
DATA_TYPE
*output,
/*
n,
c,
h,
w
*/
__private
const
int
in_chan_num,
__private
const
int
out_chan_num,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
out_height,
__private
const
int
out_width,
__private
const
int
stride
)
{
__private
const
int
out_width
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
...
...
@@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
#
ifdef
STRIDE_1
const
int
stride
=
1
;
#
else
const
int
stride
=
2
;
#
endif
const
int
in_pixel_begin
=
out_pixel_height
*
stride
*
in_width
+
out_pixel_width
*
stride
*
4
;
const
int
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
int
out_chan_len
=
out_chan_end
-
out_chan_begin
;
int
pixel_len
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
out_chan
=
out_chan_begin
; out_chan < out_chan_end; ++out_chan) {
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
float
bias_value
=
bias
==
NULL
?
0
:
bias[out_chan]
;
DATA_TYPE
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
#
ifdef
BIAS
DATA_TYPE
bias_value
=
bias[out_chan]
;
#
else
DATA_TYPE
bias_value
=
0
;
#
endif
for
(
int
p
=
0
; p < pixel_len; ++p) {
output_ptr[p]
=
bias_value
;
}
...
...
@@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
int
in_chan
=
0
;
if
(
pixel_len
==
4
)
{
if
(
stride
==
1
)
{
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
int
out_chan
=
out_chan_begin
;
for
(
; out_chan + 3 < out_chan_end; out_chan += 4) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
vec_conv_2d_1x1_s1
;
vec_conv_2d_1x1_compute_loop
;
}
for
(
; out_chan < out_chan_end; ++out_chan) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
const
DATA_TYPE*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
DATA_TYPE
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
#
ifdef
STRIDE_1
vec_conv_2d_1x1_s1
;
vec_conv_2d_1x1_compute
;
}
}
}
else
if
(
stride
==
2
)
{
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
int
out_chan
=
out_chan_begin
;
for
(
; out_chan + 3 < out_chan_end; out_chan += 4) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
#
else
vec_conv_2d_1x1_s2
;
#
endif
vec_conv_2d_1x1_compute_loop
;
}
for
(
; out_chan < out_chan_end; ++out_chan) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
const
DATA_TYPE*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
DATA_TYPE
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
#
ifdef
STRIDE_1
vec_conv_2d_1x1_s1
;
#
else
vec_conv_2d_1x1_s2
;
#
endif
vec_conv_2d_1x1_compute
;
}
}
}
}
for
(
; in_chan < in_chan_num; ++in_chan) {
const
float
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan
*
in_pixel
;
for
(
int
out_chan
=
out_chan_begin
; out_chan < out_chan_end; ++out_chan) {
float
weights
=
filter[out_chan
*
in_chan_num
+
in_chan]
;
float
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
DATA_TYPE
weights
=
filter[out_chan
*
in_chan_num
+
in_chan]
;
DATA_TYPE
*output_ptr
=
output_base
+
out_chan
*
out_pixel
;
for
(
int
p
=
0
; p < pixel_len; ++p) {
float
in
=
input_ptr[p*stride]
;
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
f07dd516
#
include
<conv_helper.h>
void
kernel
conv_2d_3x3
(
global
const
float
*input,
global
const
float
*filter,
global
const
float
*bias,
global
float
*output,
#
include
<common.h>
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s1
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
vload4
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
input1
=
vload2
(
0
,
input_ptr+4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input1.s0
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s23,
input1.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s2
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,8
)
input
=
vload8
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
input.even
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
input.odd
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input_ptr[8]
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
//
Supported
data
type:
half/float
DATA_TYPE
conv3x3
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr,
const
int
row_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,3
)
input_value
=
vload3
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_value
=
vload3
(
0
,
filter_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
res
=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
1
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
2
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
return
res.s0
+
res.s1
+
res.s2
;
}
void
kernel
conv_2d_3x3
(
global
const
DATA_TYPE
*input,
global
const
DATA_TYPE
*filter,
#
ifdef
BIAS
global
const
DATA_TYPE
*bias,
#
endif
global
DATA_TYPE
*output,
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
in_height,
private
const
int
in_width,
private
const
int
out_height,
private
const
int
out_width,
private
const
int
stride_h,
private
const
int
stride_w
)
{
private
const
int
out_width
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
...
...
@@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input,
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
const
int
in_pixel_begin
=
out_pixel_height
*
stride_h
*
in_width
+
out_pixel_width
*
stride_w
*
4
;
#
ifdef
STRIDE_1
const
int
stride
=
1
;
#
else
const
int
stride
=
2
;
#
endif
const
int
in_pixel_begin
=
out_pixel_height
*
stride
*
in_width
+
out_pixel_width
*
stride
*
4
;
const
int
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
float
*filter_base
=
filter
+
i
*
in_chan_num
*
9
;
DATA_TYPE
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
DATA_TYPE
*filter_base
=
filter
+
i
*
in_chan_num
*
9
;
if
(
pixels
==
4
)
{
#
ifdef
BIAS
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
res
=
(
VEC_DATA_TYPE
(
DATA_TYPE,
4
))
bias[i]
;
#
else
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
res
=
0
;
#
endif
float4
res
=
bias
==
NULL
?
0
:
(
float4
)
bias[i]
;
if
(
stride_w
==
1
)
{
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
DATA_TYPE
*filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
#
ifdef
STRIDE_1
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
}
else
{
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
#
else
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
#
endif
}
vstore4
(
res,
0
,
output_ptr
)
;
}
else
{
for
(
int
p
=
0
; p < pixels; ++p) {
float
res
=
bias
==
NULL
?
0
:
bias[i]
;
#
ifdef
BIAS
DATA_TYPE
res
=
bias[i]
;
#
else
DATA_TYPE
res
=
0
;
#
endif
for
(
uint
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
float*
input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
+
p
*
stride_w
;
const
float*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
+
p
*
stride
;
const
DATA_TYPE
*
filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
res
+=
conv3x3
(
input_ptr,
filter_ptr,
in_width
)
;
}
output_ptr[p]
=
res
;
...
...
mace/kernels/opencl/cl/conv_helper.cl
已删除
100644 → 0
浏览文件 @
77ea99f5
float4
conv1x3_s1
(
const
float
*input_ptr,
const
float
*filter_ptr
)
{
float4
row0
=
vload4
(
0
,
input_ptr
)
;
float2
input1
=
vload2
(
0
,
input_ptr+4
)
;
float4
row1
=
(
float4
)(
row0.s123,
input1.s0
)
;
float4
row2
=
(
float4
)(
row0.s23,
input1.s01
)
;
float3
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
float4
)
filter_values.s0
*
row0
+
(
float4
)
filter_values.s1
*
row1
+
(
float4
)
filter_values.s2
*
row2
;
}
float4
conv1x3_s2
(
const
float
*input_ptr,
const
float
*filter_ptr
)
{
float8
input
=
vload8
(
0
,
input_ptr
)
;
float4
row0
=
input.even
;
float4
row1
=
input.odd
;
float4
row2
=
(
float4
)(
row0.s123,
input_ptr[8]
)
;
float3
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
float4
)
filter_values.s0
*
row0
+
(
float4
)
filter_values.s1
*
row1
+
(
float4
)
filter_values.s2
*
row2
;
}
float
conv3x3
(
const
float
*input_ptr,
const
float
*filter_ptr,
const
int
row_width
)
{
float3
input_value
=
vload3
(
0
,
input_ptr
)
;
float3
filter_value
=
vload3
(
0
,
filter_ptr
)
;
float3
res
=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
1
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
2
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
return
res.s0
+
res.s1
+
res.s2
;
}
mace/kernels/opencl/cl/conv_helper.h
已删除
100644 → 0
浏览文件 @
77ea99f5
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
float4
conv1x3_s1
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
);
float4
conv1x3_s2
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
);
float
conv3x3
(
const
float
*
input_ptr
,
const
float
*
filter_ptr
,
const
int
row_width
);
#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
浏览文件 @
f07dd516
#
include
<conv_helper.h>
#
include
<common.h>
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s1
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
vload4
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
input1
=
vload2
(
0
,
input_ptr+4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input1.s0
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s23,
input1.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s2
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,8
)
input
=
vload8
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
input.even
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
input.odd
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input_ptr[8]
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
//
Supported
data
type:
half/float
DATA_TYPE
conv3x3
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr,
const
int
row_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,3
)
input_value
=
vload3
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_value
=
vload3
(
0
,
filter_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
res
=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
1
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
2
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
return
res.s0
+
res.s1
+
res.s2
;
}
//TODO
merge
the
depthwise
with
conv
3x3
to
remove
duplicate
code.
void
kernel
depthwise_conv_3x3
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
m,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
in_height
,
private
const
int
in_width
,
private
const
int
out
_height,
private
const
int
out
_width,
private
const
int
stride_h
,
private
const
int
stride_w
)
{
__kernel
void
depthwise_conv_3x3
(
__global
const
DATA_TYPE
*input,
/*
n,
c,
h,
w
*/
__global
const
DATA_TYPE
*filter,
/*
m,
i,
kh,
kw
*/
#
ifdef
BIAS
__global
const
DATA_TYPE
*bias,
/*
o
*/
#
endif
__global
DATA_TYPE
*output,
/*
n,
c,
h,
w
*/
__private
const
int
in_chan_num
,
__private
const
int
out_chan_num
,
__private
const
int
in
_height,
__private
const
int
in
_width,
__private
const
int
out_height
,
__private
const
int
out_width
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
...
...
@@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
const
int
in_pixel_begin
=
out_pixel_height
*
stride_h
*
in_width
+
out_pixel_width
*
stride_w
*
4
;
#
ifdef
STRIDE_1
const
int
in_pixel_begin
=
out_pixel_height
*
in_width
+
out_pixel_width
*
4
;
#
else
const
int
in_pixel_begin
=
out_pixel_height
*
2
*
in_width
+
out_pixel_width
*
2
*
4
;
#
endif
const
int
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
float
bias_value
=
bias[i]
;
const
float
*input_ptr
=
input_base
+
(
i
/
multiplier
)
*
in_pixel
;
const
float
*filter_ptr
=
filter
+
i
*
9
;
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
DATA_TYPE
*input_ptr
=
input_base
+
(
i
/
multiplier
)
*
in_pixel
;
const
DATA_TYPE
*filter_ptr
=
filter
+
i
*
9
;
DATA_TYPE
*output_ptr
=
output_base
+
i
*
out_pixel
;
if
(
pixels
==
4
)
{
float4
res
=
(
float4
)
bias[i]
;
if
(
stride_w
==
1
)
{
#
ifdef
BIAS
VEC_DATA_TYPE
(
DATA_TYPE,4
)
res
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
bias[i]
;
#
else
VEC_DATA_TYPE
(
DATA_TYPE,4
)
res
=
0
;
#
endif
/*
defined
(
BIAS
)
*/
#
ifdef
STRIDE_1
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
else
{
#
else
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
}
#
endif
vstore4
(
res,
0
,
output_ptr
)
;
}
else
{
for
(
int
p
=
0
; p < pixels; ++p) {
float
res
=
bias[i]
;
#
ifdef
BIAS
DATA_TYPE
res
=
bias[i]
;
#
else
DATA_TYPE
res
=
0
;
#
endif
res
+=
conv3x3
(
input_ptr,
filter_ptr,
in_width
)
;
output_ptr[p]
=
res
;
input_ptr
+=
stride_w
;
#
ifdef
STRIDE_1
input_ptr
+=
1
;
#
else
input_ptr
+=
2
;
#
endif
}
}
}
...
...
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
f07dd516
float4
vec_pooling_3_s1
(
const
float
*input_ptr,
const
int
in_width
)
{
float4
row00
=
vload4
(
0
,
input_ptr
)
;
float2
row01
=
vload2
(
0
,
input_ptr
+
4
)
;
float4
row10
=
vload4
(
0
,
input_ptr
+
in_width
)
;
float2
row11
=
vload2
(
0
,
input_ptr
+
in_width
+
4
)
;
float4
row20
=
vload4
(
0
,
input_ptr
+
in_width
*
2
)
;
float2
row21
=
vload2
(
0
,
input_ptr
+
in_width
*
2
+
4
)
;
float8
data00
=
(
float8
)(
row00.s01212323
)
;
float4
data01
=
(
float4
)(
row01.s0,
row00.s3,
row01.s01
)
;
float8
data10
=
(
float8
)(
row10.s01212323
)
;
float4
data11
=
(
float4
)(
row11.s0,
row10.s3,
row11.s01
)
;
float8
data20
=
(
float8
)(
row20.s01212323
)
;
float4
data21
=
(
float4
)(
row21.s0,
row20.s3,
row21.s01
)
;
float8
left
=
fmax
(
fmax
(
data00,
data10
)
,
data20
)
;
float4
right
=
fmax
(
fmax
(
data01,
data11
)
,
data21
)
;
float4
res
=
fmax
((
float4
)(
left.s036,
right.s1
)
,
(
float4
)(
left.s147,
right.s2
))
;
res
=
fmax
(
res,
(
float4
)(
left.s25,
right.s03
))
;
#
include
<common.h>
VEC_DATA_TYPE
(
DATA_TYPE,4
)
vec_pooling_3_s1
(
const
DATA_TYPE
*input_ptr,
const
int
in_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row00
=
vload4
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
row01
=
vload2
(
0
,
input_ptr
+
4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row10
=
vload4
(
0
,
input_ptr
+
in_width
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
row11
=
vload2
(
0
,
input_ptr
+
in_width
+
4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row20
=
vload4
(
0
,
input_ptr
+
in_width
*
2
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
row21
=
vload2
(
0
,
input_ptr
+
in_width
*
2
+
4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data00
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row00.s01212323
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data01
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row01.s0,
row00.s3,
row01.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data10
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row10.s01212323
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data11
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row11.s0,
row10.s3,
row11.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data20
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row20.s01212323
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data21
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row21.s0,
row20.s3,
row21.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
left
=
fmax
(
fmax
(
data00,
data10
)
,
data20
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
right
=
fmax
(
fmax
(
data01,
data11
)
,
data21
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
res
=
fmax
((
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s036,
right.s1
)
,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s147,
right.s2
))
;
res
=
fmax
(
res,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s25,
right.s03
))
;
return
res
;
}
float4
vec_pooling_3_s2
(
const
float
*input_ptr,
const
int
in_width
)
{
float8
row00
=
vload8
(
0
,
input_ptr
)
;
float
row01
=
*
(
input_ptr
+
8
)
;
float8
row10
=
vload8
(
0
,
input_ptr
+
in_width
)
;
float
row11
=
*
(
input_ptr
+
in_width
+
8
)
;
float8
row20
=
vload8
(
0
,
input_ptr
+
in_width
*
2
)
;
float
row21
=
*
(
input_ptr
+
in_width
*
2
+
8
)
;
float8
data00
=
(
float8
)(
row00.s01223445
)
;
float4
data01
=
(
float4
)(
row00.s667,
row01
)
;
float8
data10
=
(
float8
)(
row10.s01223445
)
;
float4
data11
=
(
float4
)(
row10.s667,
row11
)
;
float8
data20
=
(
float8
)(
row20.s01223445
)
;
float4
data21
=
(
float4
)(
row20.s667,
row21
)
;
float8
left
=
fmax
(
fmax
(
data00,
data10
)
,
data20
)
;
float4
right
=
fmax
(
fmax
(
data01,
data11
)
,
data21
)
;
float4
res
=
fmax
((
float4
)(
left.s036,
right.s1
)
,
(
float4
)(
left.s147,
right.s2
))
;
res
=
fmax
(
res,
(
float4
)(
left.s25,
right.s03
))
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
vec_pooling_3_s2
(
const
DATA_TYPE
*input_ptr,
const
int
in_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,8
)
row00
=
vload8
(
0
,
input_ptr
)
;
DATA_TYPE
row01
=
*
(
input_ptr
+
8
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
row10
=
vload8
(
0
,
input_ptr
+
in_width
)
;
DATA_TYPE
row11
=
*
(
input_ptr
+
in_width
+
8
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
row20
=
vload8
(
0
,
input_ptr
+
in_width
*
2
)
;
DATA_TYPE
row21
=
*
(
input_ptr
+
in_width
*
2
+
8
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data00
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row00.s01223445
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data01
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row00.s667,
row01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data10
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row10.s01223445
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data11
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row10.s667,
row11
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data20
=
(
VEC_DATA_TYPE
(
DATA_TYPE,8
))(
row20.s01223445
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data21
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row20.s667,
row21
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
left
=
fmax
(
fmax
(
data00,
data10
)
,
data20
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
right
=
fmax
(
fmax
(
data01,
data11
)
,
data21
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
res
=
fmax
((
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s036,
right.s1
)
,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s147,
right.s2
))
;
res
=
fmax
(
res,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
left.s25,
right.s03
))
;
return
res
;
}
float
inner_pooling_3
(
const
float
*input_ptr,
const
int
in_width
)
{
float3
row0
=
vload3
(
0
,
input_ptr
)
;
float3
row1
=
vload3
(
0
,
input_ptr
+
in_width
)
;
float3
row2
=
vload3
(
0
,
input_ptr
+
in_width
*
2
)
;
DATA_TYPE
inner_pooling_3
(
const
DATA_TYPE
*input_ptr,
const
int
in_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,3
)
row0
=
vload3
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
row1
=
vload3
(
0
,
input_ptr
+
in_width
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
row2
=
vload3
(
0
,
input_ptr
+
in_width
*
2
)
;
float3
data
=
fmax
(
fmax
(
row0,
row1
)
,
row2
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
data
=
fmax
(
fmax
(
row0,
row1
)
,
row2
)
;
float
res
=
fmax
(
fmax
(
data.s0,
data.s1
)
,
data.s2
)
;
DATA_TYPE
res
=
fmax
(
fmax
(
data.s0,
data.s1
)
,
data.s2
)
;
return
res
;
}
__kernel
void
pooling3
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
//
Supported
data
type:
half/float
__kernel
void
pooling3
(
__global
const
DATA_TYPE
*input,
/*
n,
c,
h,
w
*/
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
out_chan_num,
__private
const
int
out_height,
__private
const
int
out_width,
__private
const
int
stride,
__global
float
*output
)
{
__global
DATA_TYPE
*output
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
...
...
@@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */
const
int
in_offset
=
batch
*
out_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
const
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
const
float
*input_ptr
=
input_base
+
i
*
in_pixel
;
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
DATA_TYPE
*input_ptr
=
input_base
+
i
*
in_pixel
;
DATA_TYPE
*output_ptr
=
output_base
+
i
*
out_pixel
;
if
(
pixels
==
4
)
{
float4
res
;
if
(
stride
==
1
)
{
VEC_DATA_TYPE
(
DATA_TYPE,4
)
res
;
#
ifdef
STRIDE_1
res
=
vec_pooling_3_s1
(
input_ptr,
in_width
)
;
}
else
{
#
else
res
=
vec_pooling_3_s2
(
input_ptr,
in_width
)
;
}
#
endif
vstore4
(
res,
0
,
output_ptr
)
;
}
else
{
for
(
int
p
=
0
; p < pixels; ++p) {
...
...
@@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h,
return
(
h_end
-
h_start
)
*
(
w_end
-
w_start
)
;
}
__kernel
void
poolingn
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
//
Supported
data
type:
half/float
__kernel
void
poolingn
(
__global
const
DATA_TYPE
*input,
/*
n,
c,
h,
w
*/
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
out_chan_num,
...
...
@@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
__private
const
int
pad_h,
__private
const
int
pad_w,
__private
const
int
pooling_size,
__global
float
*output
)
{
__global
DATA_TYPE
*output
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_idx
=
get_global_id
(
1
)
;
int
out_pixel_idx
=
get_global_id
(
2
)
;
...
...
@@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
const
int
in_offset
=
batch
*
out_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
float
*input_base
=
input
+
in_offset
+
in_pixel_idx
;
float
*output_base
=
output
+
out_offset
+
out_pixel_idx
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_idx
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_idx
;
const
int
block_size
=
calculate_avg_block_size
(
out_pixel_height
*
stride,
...
...
@@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
in_height
-
pad_h,
in_width
-
pad_w
)
;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
float8
sum8
=
0.0f
;
float
sum1
=
0.0f
;
float
*output_ptr
=
output_base
+
i
*
out_pixel
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
sum8
=
0.0f
;
DATA_TYPE
sum1
=
0.0f
;
DATA_TYPE
*output_ptr
=
output_base
+
i
*
out_pixel
;
for
(
int
y
=
0
; y < pooling_size; ++y) {
const
float
*input_ptr
=
input_base
+
i
*
in_pixel
+
y
*
in_width
;
const
DATA_TYPE
*input_ptr
=
input_base
+
i
*
in_pixel
+
y
*
in_width
;
int
x
=
0
;
for
(
; x < (pooling_size-8); x += 8) {
float8
data
=
vload8
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,8
)
data
=
vload8
(
0
,
input_ptr
)
;
sum8
+=
data
;
input_ptr
+=
8
;
}
...
...
@@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
input_ptr++
;
}
}
float4
sum4
=
sum8.s0123
+
sum8.s4567
;
float2
sum2
=
sum4.s01
+
sum4.s23
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
sum4
=
sum8.s0123
+
sum8.s4567
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
sum2
=
sum4.s01
+
sum4.s23
;
*output_ptr
=
(
sum2.s0
+
sum2.s1
+
sum1
)
/
block_size
;
}
...
...
mace/kernels/opencl/cl/relu.cl
浏览文件 @
f07dd516
__kernel
void
relu
(
__global
const
float
*input,
#
include
<common.h>
//
Supported
data
type:
half/float
__kernel
void
relu
(
__global
const
DATA_TYPE
*input,
__private
const
int
size,
__global
float
*output
)
{
__global
DATA_TYPE
*output
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
+
4
>
size
)
{
...
...
@@ -8,16 +11,16 @@ __kernel void relu(__global const float *input,
*
(
output+idx
)
=
fmax
(
*
(
input+idx
)
,
0
)
;
}
}
else
{
float4
data
=
vload4
(
idx,
input
)
;
data
=
fmax
(
data,
(
float4
)
0
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data
=
vload4
(
idx,
input
)
;
data
=
fmax
(
data,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)
0
)
;
vstore4
(
data,
idx,
output
)
;
}
}
__kernel
void
relux
(
__global
const
float
*input,
__private
const
float
max_limit,
__kernel
void
relux
(
__global
const
DATA_TYPE
*input,
__private
const
DATA_TYPE
max_limit,
__private
const
int
size,
__global
float
*output
)
{
__global
DATA_TYPE
*output
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
+
4
>
size
)
{
...
...
@@ -25,8 +28,8 @@ __kernel void relux(__global const float *input,
*
(
output+idx
)
=
clamp
(
*
(
input+idx
)
,
0.0f,
max_limit
)
;
}
}
else
{
float4
data
=
vload4
(
idx,
input
)
;
data
=
clamp
(
data,
(
float4
)
0
,
(
float4
)
max_limit
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
data
=
vload4
(
idx,
input
)
;
data
=
clamp
(
data,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
0
,
(
VEC_DATA_TYPE
(
DATA_TYPE,4
)
)
max_limit
)
;
vstore4
(
data,
idx,
output
)
;
}
}
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
f07dd516
__kernel
void
resize_bilinear_nocache
(
__global
const
float
*input,
/*
n
*
c,
h,
w
*/
__global
float
*output
/*
n
*
c,
h,
w
*/,
#
include
<common.h>
//
Supported
data
type:
half/float
__kernel
void
resize_bilinear_nocache
(
__global
const
DATA_TYPE
*input,
/*
n
*
c,
h,
w
*/
__global
DATA_TYPE
*output
/*
n
*
c,
h,
w
*/,
__private
const
float
height_scale,
__private
const
float
width_scale,
__private
const
int
in_height,
...
...
@@ -21,16 +24,16 @@ __kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h,
const
float
h_lerp
=
h_in
-
h_lower
;
const
float
w_lerp
=
w_in
-
w_lower
;
const
float
*input_base
=
input
+
c
*
in_height
*
in_width
;
float
*output_base
=
output
+
c
*
height
*
width
;
const
DATA_TYPE
*input_base
=
input
+
c
*
in_height
*
in_width
;
DATA_TYPE
*output_base
=
output
+
c
*
height
*
width
;
float
top_left
=
input_base[h_lower
*
in_width
+
w_lower]
;
float
top_right
=
input_base[h_lower
*
in_width
+
w_upper]
;
float
bottom_left
=
input_base[h_upper
*
in_width
+
w_lower]
;
float
bottom_right
=
input_base[h_upper
*
in_width
+
w_upper]
;
DATA_TYPE
top_left
=
input_base[h_lower
*
in_width
+
w_lower]
;
DATA_TYPE
top_right
=
input_base[h_lower
*
in_width
+
w_upper]
;
DATA_TYPE
bottom_left
=
input_base[h_upper
*
in_width
+
w_lower]
;
DATA_TYPE
bottom_right
=
input_base[h_upper
*
in_width
+
w_upper]
;
const
float
top
=
top_left
+
(
top_right
-
top_left
)
*
w_lerp
;
const
float
bottom
=
bottom_left
+
(
bottom_right
-
bottom_left
)
*
w_lerp
;
const
DATA_TYPE
top
=
top_left
+
(
top_right
-
top_left
)
*
w_lerp
;
const
DATA_TYPE
bottom
=
bottom_left
+
(
bottom_right
-
bottom_left
)
*
w_lerp
;
output_base[h
*
width
+
w]
=
top
+
(
bottom
-
top
)
*
h_lerp
;
}
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
f07dd516
void
kernel
space_to_batch
(
global
float
*space_data_ptr,
global
const
int
*block_shape_ptr,
global
const
int
*paddings_ptr,
private
const
int
space_batch,
private
const
int
space_channel,
private
const
int
space_height,
private
const
int
space_width,
private
const
int
batch_height,
private
const
int
batch_width,
private
const
int
b2s,
global
float*
batch_data_ptr
)
{
#
include
<common.h>
//
Supported
data
type:
all
__kernel
void
space_to_batch
(
__global
DATA_TYPE
*space_data_ptr,
__global
const
int
*block_shape_ptr,
__global
const
int
*paddings_ptr,
__private
const
int
space_batch,
__private
const
int
space_channel,
__private
const
int
space_height,
__private
const
int
space_width,
__private
const
int
batch_height,
__private
const
int
batch_width,
__private
const
int
b2s,
__global
DATA_TYPE*
batch_data_ptr
)
{
int
batch_idx
=
get_global_id
(
0
)
;
int
batch_channel_idx
=
get_global_id
(
1
)
;
int
batch_pixel_idx
=
get_global_id
(
2
)
;
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
f07dd516
...
...
@@ -61,16 +61,19 @@ void Conv1x1V2(const Tensor *input,
// TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
// TODO check wired clReleaseCommandQueue latency
// The KernelFunctor can cause segment faults in cb_retain_event
auto
conv_2d_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_1x1_v2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
stride
==
1
?
"-DSTRIDE_1"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
auto
conv_2d_kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
"conv_2d_1x1_v2"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
if
(
bias
==
NULL
)
{
conv_2d_kernel
.
setArg
(
idx
++
,
NULL
);
}
else
{
if
(
bias
!=
nullptr
)
{
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
}
...
...
@@ -81,7 +84,6 @@ void Conv1x1V2(const Tensor *input,
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
conv_2d_kernel
.
setArg
(
idx
++
,
stride
);
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
=
command_queue
.
enqueueNDRangeKernel
(
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
f07dd516
...
...
@@ -22,14 +22,17 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_3x3"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
stride
==
1
?
"-DSTRIDE_1"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
auto
conv_kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
"conv_2d_3x3"
,
built_options
);
uint32_t
idx
=
0
;
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
if
(
bias
==
nullptr
)
{
conv_kernel
.
setArg
(
idx
++
,
NULL
);
}
else
{
if
(
bias
!=
nullptr
)
{
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
}
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
...
...
@@ -39,8 +42,6 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
3
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
height
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
width
));
conv_kernel
.
setArg
(
idx
++
,
stride
);
conv_kernel
.
setArg
(
idx
++
,
stride
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
output
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
pixel_blocks
)};
...
...
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
浏览文件 @
f07dd516
...
...
@@ -30,13 +30,18 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const
index_t
pixel_blocks
=
(
width
+
3
)
/
4
*
height
;
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_kernel
=
cl
::
Kernel
(
program
,
"depthwise_conv_3x3"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
stride
==
1
?
"-DSTRIDE_1"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
auto
conv_kernel
=
runtime
->
BuildKernel
(
"depthwise_conv_3x3"
,
"depthwise_conv_3x3"
,
built_options
);
uint32_t
idx
=
0
;
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
if
(
bias
!=
nullptr
)
{
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
}
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channels
));
...
...
@@ -44,8 +49,6 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
3
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
height
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
width
));
conv_kernel
.
setArg
(
idx
++
,
stride
);
conv_kernel
.
setArg
(
idx
++
,
stride
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
output
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
channel_blocks
),
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
f07dd516
...
...
@@ -30,24 +30,26 @@ static void Pooling3(const Tensor *input,
};
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
stride
[
0
]
==
1
?
"-DSTRIDE_1"
:
""
);
auto
pooling_kernel
=
runtime
->
BuildKernel
(
"pooling"
,
"pooling3"
,
built_options
);
auto
max_pooling_kernel
=
cl
::
Kernel
(
program
,
"pooling3"
);
const
uint32_t
lws
[
3
]
=
{
1
,
8
,
128
};
uint32_t
idx
=
0
;
max_
pooling_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
max_
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
max_
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
3
)));
max_
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channels
));
max_
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_height
));
max_
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_width
));
max_
pooling_kernel
.
setArg
(
idx
++
,
stride
[
0
]);
max_
pooling_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
pooling_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
3
)));
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channels
));
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_height
));
pooling_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_width
));
pooling_kernel
.
setArg
(
idx
++
,
stride
[
0
]);
pooling_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
max_
pooling_kernel
,
cl
::
NullRange
,
pooling_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]));
MACE_CHECK
(
error
==
CL_SUCCESS
);
...
...
@@ -75,9 +77,9 @@ static void PoolingN(const Tensor *input,
};
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
()
;
auto
pooling_kernel
=
cl
::
Kernel
(
program
,
"poolingn"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
auto
pooling_kernel
=
runtime
->
BuildKernel
(
"pooling"
,
"poolingn"
,
built_options
);
const
uint32_t
lws
[
3
]
=
{
1
,
8
,
128
};
...
...
mace/kernels/opencl/relu_opencl.cc
浏览文件 @
f07dd516
...
...
@@ -21,9 +21,10 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
if
(
max_limit_
<
0
)
{
auto
relu_kernel
=
cl
::
Kernel
(
program
,
"relu"
);
auto
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relu"
,
built_options
);
const
uint32_t
lws
=
runtime
->
GetKernelMaxWorkGroupSize
(
relu_kernel
);
uint32_t
idx
=
0
;
...
...
@@ -37,7 +38,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
cl
::
NDRange
(
lws
));
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
else
{
auto
relu_kernel
=
cl
::
Kernel
(
program
,
"relux"
);
auto
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relux"
,
built_options
);
const
uint32_t
lws
=
runtime
->
GetKernelMaxWorkGroupSize
(
relu_kernel
);
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
f07dd516
...
...
@@ -29,9 +29,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
float
width_scale
=
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
auto
rb_kernel
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
"resize_bilinear_nocache"
,
built_options
);
auto
rb_kernel
=
cl
::
Kernel
(
program
,
"resize_bilinear_nocache"
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
rb_kernel
);
uint32_t
idx
=
0
;
rb_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
...
...
mace/kernels/opencl/space_to_batch_opecl.cc
浏览文件 @
f07dd516
...
...
@@ -18,9 +18,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
const
Tensor
*
paddings_tensor
,
Tensor
*
batch_tensor
)
{
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
()
;
auto
s2b_kernel
=
cl
::
Kernel
(
program
,
"space_to_batch"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
space_tensor
->
dtype
())
);
auto
s2b_kernel
=
runtime
->
BuildKernel
(
"space_to_batch"
,
"space_to_batch"
,
built_options
);
uint32_t
idx
=
0
;
s2b_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
space_tensor
->
buffer
())));
...
...
mace/proto/mace.proto
浏览文件 @
f07dd516
...
...
@@ -24,6 +24,7 @@ enum DataType {
DT_UINT16
=
9
;
DT_BOOL
=
10
;
DT_HALF
=
19
;
DT_UINT32
=
22
;
}
message
TensorProto
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录