Commit 88120708
Authored on Jul 28, 2018 by liuqi

Add gpu avalibility check and return status to user if gpu call failed.

Parent: d9a58a5e
Showing 47 changed files with 1,161 additions and 560 deletions.
Changed files:

mace/core/file_storage.cc  +2 -2
mace/core/runtime/opencl/opencl_allocator.cc  +11 -4
mace/core/runtime/opencl/opencl_runtime.cc  +105 -28
mace/core/runtime/opencl/opencl_runtime.h  +21 -6
mace/core/runtime/opencl/opencl_wrapper.cc  +319 -161
mace/core/workspace.cc  +22 -20
mace/examples/cli/example.cc  +4 -1
mace/kernels/opencl/activation.cc  +4 -2
mace/kernels/opencl/addn.cc  +4 -2
mace/kernels/opencl/batch_norm.cc  +4 -2
mace/kernels/opencl/bias_add.cc  +3 -2
mace/kernels/opencl/buffer_to_image.cc  +5 -3
mace/kernels/opencl/channel_shuffle.cc  +5 -3
mace/kernels/opencl/concat.cc  +19 -10
mace/kernels/opencl/conv_2d_1x1.cc  +31 -23
mace/kernels/opencl/conv_2d_3x3.cc  +27 -20
mace/kernels/opencl/conv_2d_general.cc  +31 -24
mace/kernels/opencl/crop.cc  +16 -8
mace/kernels/opencl/deconv_2d_opencl.cc  +4 -2
mace/kernels/opencl/depth_to_space.cc  +6 -3
mace/kernels/opencl/depthwise_conv.cc  +29 -21
mace/kernels/opencl/eltwise.cc  +4 -2
mace/kernels/opencl/fully_connected.cc  +8 -6
mace/kernels/opencl/helper.cc  +38 -27
mace/kernels/opencl/helper.h  +11 -11
mace/kernels/opencl/image_to_buffer.cc  +6 -4
mace/kernels/opencl/matmul.cc  +4 -2
mace/kernels/opencl/out_of_range_check_test.cc  +11 -4
mace/kernels/opencl/pad.cc  +4 -2
mace/kernels/opencl/pooling.cc  +22 -13
mace/kernels/opencl/reduce_mean_opencl.cc  +8 -4
mace/kernels/opencl/resize_bilinear.cc  +28 -19
mace/kernels/opencl/slice.cc  +5 -2
mace/kernels/opencl/softmax.cc  +18 -12
mace/kernels/opencl/space_to_batch.cc  +6 -3
mace/kernels/opencl/winograd_transform.cc  +12 -6
mace/libmace/mace.cc  +44 -0
mace/proto/mace.proto  +9 -2
mace/public/mace_runtime.h  +76 -51
mace/python/tools/convert_util.py  +65 -0
mace/python/tools/converter.py  +7 -0
mace/python/tools/converter_tool/base_converter.py  +1 -0
mace/python/tools/converter_tool/transformer.py  +20 -12
mace/python/tools/memory_optimizer.py  +77 -31
mace/python/tools/model.jinja2  +1 -0
mace/test/mace_api_mt_test.cc  +2 -0
mace/test/mace_api_test.cc  +2 -0
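Before the per-file diffs, the shape of the change in one place: OpenCL calls that used to abort the process on failure now log the error and hand a status back to the caller, who can fall back to another strategy. A minimal, self-contained sketch of that calling convention (the enum and `DoGpuWork` are simplified stand-ins, not code from this commit):

```cpp
#include <iostream>

// Simplified stand-in for the real MaceStatus in mace/public; sketch only.
enum class MaceStatus { MACE_SUCCESS, MACE_OUT_OF_RESOURCES };

// Before this commit: a failure here crashed the process via LOG(FATAL).
// After: the callee reports failure and the caller decides what to do.
MaceStatus DoGpuWork(bool gpu_ok) {
  if (!gpu_ok) {
    std::cerr << "GPU call failed" << std::endl;
    return MaceStatus::MACE_OUT_OF_RESOURCES;  // propagate, don't abort
  }
  return MaceStatus::MACE_SUCCESS;
}

int main() {
  if (DoGpuWork(false) != MaceStatus::MACE_SUCCESS) {
    std::cerr << "falling back to another strategy" << std::endl;
  }
  return 0;
}
```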
mace/core/file_storage.cc

```diff
@@ -37,8 +37,8 @@ int FileStorage::Load() {
   struct stat st;
   if (stat(file_path_.c_str(), &st) == -1) {
     if (errno == ENOENT) {
-      LOG(INFO) << "File " << file_path_ << " does not exist";
+      VLOG(1) << "File " << file_path_ << " does not exist";
       return 0;
     } else {
       LOG(WARNING) << "Stat file " << file_path_
```
mace/core/runtime/opencl/opencl_allocator.cc

```diff
@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              offset, nbytes, nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1,
       nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   auto queue = OpenCLRuntime::Global()->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+  }
 }

 bool OpenCLAllocator::OnHost() const { return false; }
```
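With `MACE_CHECK_CL_SUCCESS` gone, `Map()` and `MapImage()` now fail softly by returning `nullptr`, so callers have to guard the pointer before using it. A hedged sketch of that caller-side implication (the commit itself only changes the allocator; the guard below is an assumption about call sites):

```cpp
#include <iostream>

// Stand-ins so the sketch compiles on its own; sketch only.
enum class MaceStatus { MACE_SUCCESS, MACE_OUT_OF_RESOURCES };

// Mimics the new OpenCLAllocator::Map behaviour: log and return nullptr
// instead of aborting the process on an enqueueMapBuffer failure.
void *Map(bool fail) {
  static char backing[64];
  if (fail) {
    std::cerr << "Map buffer failed" << std::endl;
    return nullptr;
  }
  return backing;
}

// A caller must now check for a null mapping before touching the memory.
MaceStatus UseMapped(bool fail) {
  void *ptr = Map(fail);
  if (ptr == nullptr) return MaceStatus::MACE_OUT_OF_RESOURCES;
  // ... read or write through ptr ...
  return MaceStatus::MACE_SUCCESS;
}

int main() {
  return UseMapped(true) == MaceStatus::MACE_OUT_OF_RESOURCES ? 0 : 1;
}
```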
mace/core/runtime/opencl/opencl_runtime.cc

```diff
@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
 OpenCLRuntime::OpenCLRuntime():
     precompiled_binary_storage_(nullptr),
     cache_storage_(nullptr),
-    is_profiling_enabled_(false) {
+    is_opencl_avaliable_(false),
+    is_profiling_enabled_(false),
+    opencl_version_(CL_VER_UNKNOWN),
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
-    LOG(FATAL) << "No OpenCL platforms found";
+    LOG(ERROR) << "No OpenCL platforms found";
+    return;
   }
   cl::Platform default_platform = all_platforms[0];
   std::stringstream ss;
@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
   std::vector<cl::Device> all_devices;
   default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
   if (all_devices.size() == 0) {
-    LOG(FATAL) << "No OpenCL devices found";
+    LOG(ERROR) << "No OpenCL devices found";
+    return;
   }
   bool gpu_detected = false;
@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = ParseDeviceVersion(device_version);
+      if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
+        return;
+      }
       VLOG(1) << "Using device: " << device_name;
       break;
     }
   }
   if (!gpu_detected) {
-    LOG(FATAL) << "No GPU device found";
+    LOG(ERROR) << "No GPU device found";
+    return;
   }
   cl_command_queue_properties properties = 0;
@@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime():
         new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
     }
   }
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   command_queue_ = std::make_shared<cl::CommandQueue>(*context_, *device_,
                                                       properties, &err);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
@@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime():
   }

   if (cached_binary_platform_info != platform_info_) {
-    if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      LOG(WARNING) << "There is no precompiled OpenCL binary in"
-          " all OpenCL binary paths";
-    } else {
+    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
       precompiled_binary_storage_.reset(
           new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
       if (precompiled_binary_storage_->Load() != 0) {
@@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
   } else {
     this->out_of_range_check_ = false;
   }
+
+  is_opencl_avaliable_ = true;
 }

 OpenCLRuntime::~OpenCLRuntime() {
@@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
   device_.reset();
 }

+bool OpenCLRuntime::is_opencl_avaliable() {
+  static const uint64_t kMinWorkGroupSize = 64;
+  return is_opencl_avaliable_
+      && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
+}
+
 cl::Context &OpenCLRuntime::context() { return *context_; }

 cl::Device &OpenCLRuntime::device() { return *device_; }
@@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
   return true;
 }

-void OpenCLRuntime::BuildProgramFromSource(
+bool OpenCLRuntime::BuildProgramFromSource(
     const std::string &program_name,
     const std::string &built_program_key,
     const std::string &build_options_str,
@@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
       LOG(WARNING) << "Build program " << program_name
                    << " from source failed: " << MakeString(ret);
-      return;
+      return false;
     }

     // Keep built program binary
@@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
                                   sizeof(size_t) * device_list_size,
                                   program_binary_sizes.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
         new std::unique_ptr<unsigned char[]>[device_list_size]);
     for (cl_uint i = 0; i < device_list_size; ++i) {
@@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
                            sizeof(unsigned char *) * device_list_size,
                            program_binaries.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::vector<unsigned char> content(
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
@@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     VLOG(3) << "Program from source: " << built_program_key;
   }
+  return true;
 }

-void OpenCLRuntime::BuildProgram(const std::string &program_name,
+bool OpenCLRuntime::BuildProgram(const std::string &program_name,
                                  const std::string &built_program_key,
                                  const std::string &build_options,
                                  cl::Program *program) {
@@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     ret = BuildProgramFromPrecompiledBinary(built_program_key,
                                             build_options_str, program);
     if (!ret) {
-      BuildProgramFromSource(program_name, built_program_key,
-                             build_options_str, program);
+      ret = BuildProgramFromSource(program_name, built_program_key,
+                                   build_options_str, program);
     }
   }
+  return ret;
 }

-cl::Kernel OpenCLRuntime::BuildKernel(
+MaceStatus OpenCLRuntime::BuildKernel(
     const std::string &program_name,
     const std::string &kernel_name,
-    const std::set<std::string> &build_options) {
+    const std::set<std::string> &build_options,
+    cl::Kernel *kernel) {
   std::string build_options_str;
   for (auto &option : build_options) {
     build_options_str += " " + option;
@@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key,
-                       build_options_str, &program);
+    bool ret = this->BuildProgram(program_name, built_program_key,
+                                  build_options_str, &program);
+    if (!ret) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
     built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(program, kernel_name.c_str());
+  cl_int err;
+  *kernel = cl::Kernel(program, kernel_name.c_str(), &err);
+  MACE_CL_RET_STATUS(err);
+  return MaceStatus::MACE_SUCCESS;
 }

 void OpenCLRuntime::SaveBuiltCLProgram() {
@@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
 uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

+bool OpenCLRuntime::IsImageSupport() {
+  cl_bool res;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
+  return res == CL_TRUE;
+}
+
+std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
+  size_t max_height, max_width;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  return {max_height, max_width};
+}
+
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
@@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
   } else if (words[1] == "1.0") {
     return OpenCLVersion::CL_VER_1_0;
   } else {
-    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
-    return OpenCLVersion::CL_VER_1_0;
+    LOG(ERROR) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_UNKNOWN;
  }
 }
```
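The constructor now sets `is_opencl_avaliable_` only after every setup step has succeeded, and the public check additionally requires a device max work-group size of at least 64. A condensed model of that gating logic (the struct is a sketch; the 64 threshold comes straight from the diff):

```cpp
#include <cstdint>

// Condensed model of the new availability check in OpenCLRuntime.
struct RuntimeModel {
  bool init_succeeded;           // set at the very end of the constructor
  uint64_t max_work_group_size;  // queried from the device

  bool is_opencl_avaliable() const {
    static const uint64_t kMinWorkGroupSize = 64;  // value from the diff
    return init_succeeded && max_work_group_size >= kMinWorkGroupSize;
  }
};

int main() {
  RuntimeModel broken{false, 256};  // constructor bailed out early
  RuntimeModel weak{true, 32};      // device too weak to be usable
  RuntimeModel ok{true, 256};
  return (!broken.is_opencl_avaliable() && !weak.is_opencl_avaliable() &&
          ok.is_opencl_avaliable()) ? 0 : 1;
}
```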
mace/core/runtime/opencl/opencl_runtime.h

```diff
@@ -42,13 +42,23 @@ enum OpenCLVersion {
   CL_VER_1_1,
   CL_VER_1_2,
   CL_VER_2_0,
+  CL_VER_UNKNOWN,
 };

 const std::string OpenCLErrorToString(cl_int error);

 #define MACE_CHECK_CL_SUCCESS(error) \
   MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)

+#define MACE_CL_RET_ERROR(error)                            \
+  if (error != CL_SUCCESS) {                                \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error);  \
+    return error;                                           \
+  }
+
+#define MACE_CL_RET_STATUS(error)                           \
+  if (error != CL_SUCCESS) {                                \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error);  \
+    return MaceStatus::MACE_OUT_OF_RESOURCES;               \
+  }
+
 class OpenCLProfilingTimer : public Timer {
  public:
@@ -81,19 +91,23 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  bool is_opencl_avaliable();

   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
+  bool IsImageSupport();
+  std::vector<uint64_t> GetMaxImage2DSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
   bool IsOutOfRangeCheckEnabled() const;
   bool is_profiling_enabled() const;

-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
+  MaceStatus BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options,
+                         cl::Kernel *kernel);

   void SaveBuiltCLProgram();
@@ -103,7 +117,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime &) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

-  void BuildProgram(const std::string &program_file_name,
+  bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);
@@ -115,7 +129,7 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  void BuildProgramFromSource(
+  bool BuildProgramFromSource(
       const std::string &program_name,
       const std::string &built_program_key,
       const std::string &build_options_str,
@@ -125,6 +139,7 @@ class OpenCLRuntime {
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
   std::unique_ptr<KVStorage> cache_storage_;
+  bool is_opencl_avaliable_;
   bool is_profiling_enabled_;

   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
```
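The two new macros encode the commit's error-handling split: `MACE_CL_RET_ERROR` propagates the raw `cl_int` out of internal helpers, while `MACE_CL_RET_STATUS` converts to `MaceStatus` at API boundaries. A self-contained sketch of that division (the type aliases, logging, and function names below are stand-ins so the sketch compiles without OpenCL headers):

```cpp
#include <iostream>

// Stand-ins; the real definitions come from OpenCL and MACE headers.
using cl_int = int;
const cl_int CL_SUCCESS = 0;
enum class MaceStatus { MACE_SUCCESS, MACE_OUT_OF_RESOURCES };

#define MACE_CL_RET_ERROR(error)                \
  if ((error) != CL_SUCCESS) {                  \
    std::cerr << "error: " << (error) << "\n";  \
    return error;                               \
  }

#define MACE_CL_RET_STATUS(error)               \
  if ((error) != CL_SUCCESS) {                  \
    std::cerr << "error: " << (error) << "\n";  \
    return MaceStatus::MACE_OUT_OF_RESOURCES;   \
  }

// Internal helper: stays in cl_int space.
cl_int EnqueueStep(cl_int simulated) {
  MACE_CL_RET_ERROR(simulated);
  return CL_SUCCESS;
}

// API boundary: converts the raw code into a MaceStatus for the caller.
MaceStatus RunStep(cl_int simulated) {
  cl_int err = EnqueueStep(simulated);
  MACE_CL_RET_STATUS(err);
  return MaceStatus::MACE_SUCCESS;
}

int main() {
  return RunStep(-5) == MaceStatus::MACE_OUT_OF_RESOURCES ? 0 : 1;
}
```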
mace/core/runtime/opencl/opencl_wrapper.cc (+319 -161)

This diff is collapsed in the page capture; its contents are not shown.
mace/core/workspace.cc

```diff
@@ -204,26 +204,28 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::GPU) {
-      // TODO(liuqi): refactor based on PB
-      if (mem_block.mem_id() >= 20000) {
-        std::unique_ptr<BufferBase> image_buf(new Image());
-        MACE_RETURN_IF_ERROR(image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(image_buf));
-      }
-    } else {
-      if (mem_block.mem_id() < 20000) {
-        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type)));
-        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(tensor_buf));
-      }
+    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)
+          + MACE_EXTRA_BUFFER_PAD_SIZE));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+      std::unique_ptr<BufferBase> image_buf(new Image());
+      MACE_RETURN_IF_ERROR(image_buf->Allocate(
+          {mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
     }
   }
   VLOG(3) << "Preallocate buffer to tensors";
```
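The allocation loop stops inferring buffer kind from the magic `mem_id() >= 20000` threshold and instead dispatches on an explicit `mem_type()` carried with the model (the `MemoryType` values appear in the diff; the scaffolding below is a sketch, not the protobuf-generated code):

```cpp
#include <iostream>

// MemoryType values taken from the diff; MemBlock is a simplified stand-in
// for the protobuf message that now carries an explicit memory type.
enum class MemoryType { CPU_BUFFER, GPU_IMAGE, GPU_BUFFER };
struct MemBlock { int mem_id; MemoryType mem_type; };

// Old heuristic: mem_id >= 20000 meant "GPU image". Dispatching on the
// declared type keeps ids opaque and lets CPU and GPU blocks coexist.
void Preallocate(const MemBlock &blk) {
  switch (blk.mem_type) {
    case MemoryType::CPU_BUFFER:
      std::cout << blk.mem_id << ": CPU buffer\n"; break;
    case MemoryType::GPU_IMAGE:
      std::cout << blk.mem_id << ": GPU image\n"; break;
    case MemoryType::GPU_BUFFER:
      std::cout << blk.mem_id << ": GPU buffer\n"; break;
  }
}

int main() {
  Preallocate({1, MemoryType::CPU_BUFFER});
  Preallocate({20000, MemoryType::GPU_IMAGE});
  return 0;
}
```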
mace/examples/cli/example.cc

```diff
@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
 #endif

   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments"
-              << std::endl;
+    std::cerr << "Create engine error, please check the arguments first, "
+              << "if correct, the device may not run the model, "
+              << "please fall back to other strategy." << std::endl;
     exit(1);
   }
```
mace/kernels/opencl/activation.cc

```diff
@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
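activation.cc is the template for most of the remaining kernel files: the two failure points per functor, kernel compilation and tuned dispatch, are each wrapped so the status bubbles out of `operator()`. A condensed sketch of that flow (functor and helper names are simplified stand-ins; the macro mirrors MACE's `MACE_RETURN_IF_ERROR`):

```cpp
enum class MaceStatus { MACE_SUCCESS, MACE_OUT_OF_RESOURCES };

// Mirrors MACE_RETURN_IF_ERROR: evaluate once, bail out on failure.
#define RETURN_IF_ERROR(expr)                     \
  do {                                            \
    MaceStatus s = (expr);                        \
    if (s != MaceStatus::MACE_SUCCESS) return s;  \
  } while (0)

MaceStatus BuildKernel(bool ok) {
  return ok ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_OUT_OF_RESOURCES;
}
MaceStatus TuningOrRun3DKernel(bool ok) {
  return ok ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_OUT_OF_RESOURCES;
}

// The shape of every updated functor: both steps checked, status returned.
MaceStatus ActivationLike(bool build_ok, bool run_ok) {
  RETURN_IF_ERROR(BuildKernel(build_ok));
  RETURN_IF_ERROR(TuningOrRun3DKernel(run_ok));
  return MaceStatus::MACE_SUCCESS;
}

int main() {
  return ActivationLike(true, false) == MaceStatus::MACE_OUT_OF_RESOURCES
             ? 0 : 1;
}
```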
mace/kernels/opencl/addn.cc

```diff
@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("addn_opencl_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/batch_norm.cc

```diff
@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3),
             folded_constant_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/bias_add.cc

```diff
@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/buffer_to_image.cc

```diff
@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options, &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange,
         cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/channel_shuffle.cc

```diff
@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("channel_shuffle", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/concat.cc

```diff
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
-                              1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }
@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
     if (input0->dim(3) % 4 == 0) {
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
           cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
           cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);

     if (runtime->IsOutOfRangeCheckEnabled()) {
       (*kernel_error)->Map(nullptr);
       char *kerror_code = (*kernel_error)->mutable_data<char>();
```
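Every `LocalWS` helper in the kernel files gains the same `kwg_size == 0` guard. The reason sits in opencl_runtime.cc above: `GetKernelMaxWorkGroupSize` now returns 0 when the device query fails, and the old code divided by values derived from it. A sketch of the failure mode being closed (a simplified `LocalWS`, assuming the pre-change division pattern):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// If the device query fails, kwg_size arrives here as 0 (see the
// GetKernelMaxWorkGroupSize change). Without the guard, kwg_size / lws[1]
// and kwg_size / lws_size would divide by zero.
std::vector<uint32_t> LocalWSGuarded(const uint32_t *gws, uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;  // degenerate but safe work-group
  } else {
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[0] = std::min<uint32_t>(1u, kwg_size / lws[1]);
    lws[2] = std::max<uint32_t>(kwg_size / (lws[0] * lws[1]), 1u);
  }
  return lws;
}

int main() {
  const uint32_t gws[3] = {64, 64, 64};
  return LocalWSGuarded(gws, 0)[0] == 1 ? 0 : 1;
}
```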
mace/kernels/opencl/conv_2d_1x1.cc

```diff
@@ -27,30 +27,36 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] < base) {
-      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = gws[0] / 8;
+      if (lws[0] < base) {
+        lws[0] = std::max<uint32_t>(gws[0] / 4, base);
+      }
     }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
+        gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = std::min<uint32_t>(gws[2], base);
+    }
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
-      gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = std::min<uint32_t>(gws[2], base);
-  }
-  lws[2] = std::max<uint32_t>(
-      std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/conv_2d_3x3.cc

```diff
@@ -26,25 +26,30 @@ namespace {
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = std::max<uint32_t>(
-      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
-  const uint32_t base = std::max<uint32_t>(
-      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
-                              kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      RoundUp<uint32_t>(
-          cache_size / kernel_cache_size / lws_size / compute_units, base),
-      gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = std::min<uint32_t>(gws[2], base);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = std::max<uint32_t>(
+        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+    const uint32_t base = std::max<uint32_t>(
+        std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
+                                kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        RoundUp<uint32_t>(
+            cache_size / kernel_cache_size / lws_size / compute_units, base),
+        gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = std::min<uint32_t>(gws[2], base);
+    }
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[2] = std::max<uint32_t>(
-      std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/conv_2d_general.cc

```diff
@@ -30,30 +30,35 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = gws[0] / 4;
-  if (lws[0] == 0) {
-    lws[0] = gws[0];
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
-                               lws_size / compute_units) * 8,
-                              gws[2]);
-  if (lws[2] == 0) {
-    if (gws[2] < lws_limit) {
-      lws[2] = gws[2];
-    } else {
-      lws[2] = base;
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = gws[0] / 4;
+    if (lws[0] == 0) {
+      lws[0] = gws[0];
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size /
+                                 kernel_size / lws_size / compute_units) * 8,
+                                gws[2]);
+    if (lws[2] == 0) {
+      if (gws[2] < lws_limit) {
+        lws[2] = gws[2];
+      } else {
+        lws[2] = base;
+      }
     }
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[2] = std::max<uint32_t>(
-      std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
              output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/crop.cc

```diff
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
-                              1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }
@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/deconv_2d_opencl.cc

```diff
@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/depth_to_space.cc

```diff
@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   }

   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/depthwise_conv.cc

```diff
@@ -26,27 +26,33 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
-    if (lws[0] < base) {
-      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
-                                  kwg_size / lws[1]);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = cache_size / kBaseGPUMemCacheSize;
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
+      if (lws[0] < base) {
+        lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
+                                    kwg_size / lws[1]);
+      }
     }
+    lws[0] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = gws[2];
+    }
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[0] = std::max<uint32_t>(
-      std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(
-      std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_conv2d", kernel_name,
+                                              built_options, kernel));

     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0],
                                   gws[1], gws[2], multiplier);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/eltwise.cc

```diff
@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/fully_connected.cc

```diff
@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("fully_connected", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));

     if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (future != nullptr) {
       future->wait_fn = [runtime, event](CallStats *stats) {
@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("fully_connected", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));

     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("fc_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
+                                           gws->data(), *lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/helper.cc

```diff
@@ -245,23 +245,27 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                        const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
-                              kwg_size / lws[1]);
-  const uint32_t lws_size = lws[1] * lws[2];
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
-                              1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
+                                kwg_size / lws[1]);
+    const uint32_t lws_size = lws[1] * lws[2];
+    lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future) {
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();

   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 3);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 3; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]),
                 nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
               cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]),
                 nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
       tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
@@ -388,13 +394,14 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
     };
   }
+  return MaceStatus::MACE_SUCCESS;
 }

-void TuningOrRun2DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future) {
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 2);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 2; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1]),
               cl::NDRange(params[0], params[1]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
       tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
+
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     };
   }
+  return MaceStatus::MACE_SUCCESS;
 }

 }  // namespace kernels
```
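TuningOrRun{2,3}DKernel previously discarded the tuner's `cl_int` result; now the value of `Tuner::TuneOrRun` is captured and converted with `MACE_CL_RET_STATUS` before the profiling future is wired up. A sketch of that order of operations (the tuner is modeled as a plain callable returning `cl_int`; names are stand-ins):

```cpp
#include <functional>
#include <iostream>

using cl_int = int;
const cl_int CL_SUCCESS = 0;
enum class MaceStatus { MACE_SUCCESS, MACE_OUT_OF_RESOURCES };

// Condensed TuningOrRun: run (possibly tuning) first, check the code,
// and only then publish the completion future to the caller.
MaceStatus TuningOrRun(const std::function<cl_int()> &tune_or_run,
                       std::function<void()> *future) {
  cl_int err = tune_or_run();
  if (err != CL_SUCCESS) {  // MACE_CL_RET_STATUS in the real code
    std::cerr << "error: " << err << "\n";
    return MaceStatus::MACE_OUT_OF_RESOURCES;
  }
  if (future != nullptr) {
    *future = [] { /* wait on the cl::Event in the real code */ };
  }
  return MaceStatus::MACE_SUCCESS;
}

int main() {
  std::function<void()> f;
  return TuningOrRun([] { return -54; }, &f) ==
                 MaceStatus::MACE_OUT_OF_RESOURCES
             ? 0 : 1;
}
```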
mace/kernels/opencl/helper.h
View file @ 88120708
...
@@ -65,17 +65,17 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpstreamCLDt(const DataType dt);
 
-void TuningOrRun3DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future);
-
-void TuningOrRun2DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future);
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future);
+
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future);
 
 inline void SetFuture(StatsFuture *future, const cl::Event &event) {
   if (future != nullptr) {
...
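With both tuning helpers now declared to return MaceStatus, every launch site can forward failures with MACE_RETURN_IF_ERROR, exactly as the kernel files below do. A minimal caller-side sketch (the function name, tuning key, and sizes are illustrative, not from the source):

// Illustrative call site showing status propagation from a 2D launch.
MaceStatus RunExampleKernel(const cl::Kernel &kernel, StatsFuture *future) {
  const uint32_t gws[2] = {64, 64};
  const std::vector<uint32_t> lws = {8, 8, 0};
  MACE_RETURN_IF_ERROR(
      TuningOrRun2DKernel(kernel, "example_kernel", gws, lws, future));
  return MaceStatus::MACE_SUCCESS;
}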
mace/kernels/opencl/image_to_buffer.cc
View file @ 88120708
...
@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       kernel_error_->UnMap();
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options,
+                                            &b2f_kernel));
 
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange,
         cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
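This is the pattern repeated across the kernel files that follow: BuildKernel no longer returns a cl::Kernel by value but fills an out-parameter and reports a MaceStatus. A condensed sketch of the new call shape (the signature is inferred from the call sites in this diff, not from the runtime header itself):

// Hedged sketch of the new BuildKernel call pattern.
MaceStatus BuildExampleKernel(OpenCLRuntime *runtime,
                              const std::set<std::string> &built_options,
                              cl::Kernel *kernel) {
  // On failure the status propagates instead of aborting the process.
  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
                                            "example_kernel_name",
                                            built_options, kernel));
  return MaceStatus::MACE_SUCCESS;
}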
mace/kernels/opencl/matmul.cc
View file @ 88120708
...
@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/out_of_range_check_test.cc
View file @ 88120708
...
@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
     kernel_error->UnMap();
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  cl_int error = runtime->BuildKernel("buffer_to_image",
+                                      obfuscated_kernel_name,
+                                      built_options,
+                                      &b2f_kernel);
+  if (error != CL_SUCCESS) {
+    return false;
+  }
 
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, kwg_size / 16};
   cl::Event event;
-  cl_int error;
   if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
...
@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
         b2f_kernel, cl::NullRange,
         cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    return false;
+  }
 
   runtime->command_queue().finish();
   bool is_out_of_range = false;
...
mace/kernels/opencl/pad.cc
View file @ 88120708
...
@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                   output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/pooling.cc
View file @ 88120708
...
@@ -25,18 +25,23 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
-                              kwg_size / lws[1]);
-  const uint32_t lws_size = lws[1] * lws[2];
-  lws[0] = gws[0] / 4;
-  if (lws[0] == 0) {
-    lws[0] = gws[0];
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
+                                kwg_size / lws[1]);
+    const uint32_t lws_size = lws[1] * lws[2];
+    lws[0] = gws[0] / 4;
+    if (lws[0] == 0) {
+      lws[0] = gws[0];
+    }
+    lws[0] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[0], kwg_size / lws_size), 1);
   }
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
-                              1);
   return lws;
 }
...
@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::string tuning_key = Concat("pooling_opencl_kernel_", output->dim(0),
                                   output->dim(1), output->dim(2),
                                   output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
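The new kwg_size == 0 branch guards the divisions by kwg_size / lws[1] and kwg_size / lws_size, which would otherwise divide by zero when the maximum workgroup size is unknown. A zero value is plausible when the workgroup-size query fails; a hedged sketch of such a query using the raw OpenCL API (not MACE's wrapper):

#include <CL/cl.h>

// Sketch: a max-workgroup-size query that reports 0 on failure, the
// degenerate case the rewritten LocalWS above now tolerates.
size_t MaxKernelWorkGroupSize(cl_kernel kernel, cl_device_id device) {
  size_t size = 0;
  if (clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                               sizeof(size), &size, nullptr) != CL_SUCCESS) {
    return 0;  // callers must treat 0 as "unknown"
  }
  return size;
}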
mace/kernels/opencl/reduce_mean_opencl.cc
View file @ 88120708
...
@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
     }
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", kernel_name,
+                                              built_options, &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
...
@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
mace/kernels/opencl/resize_bilinear.cc
View file @ 88120708
...
@@ -25,25 +25,30 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
-    }
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = gws[2] / 8;
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = gws[0] / 8;
+      if (lws[0] == 0) {
+        lws[0] = gws[0];
+      }
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = gws[2] / 8;
+    if (lws[2] == 0) {
+      lws[2] = gws[2];
+    }
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
+  }
   return lws;
 }
...
@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("resize_bilinear",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/slice.cc
View file @ 88120708
...
@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", kernel_name,
+                                              built_options,
+                                              &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/softmax.cc
View file @ 88120708
...
@@ -25,19 +25,23 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (gws[0] < base) {
-    lws[0] = gws[0];
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
   } else {
-    lws[0] = gws[0] / base;
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (gws[0] < base) {
+      lws[0] = gws[0];
+    } else {
+      lws[0] = gws[0] / base;
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1])), 1);
   }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(
-      gws[2], kwg_size / (lws[0] * lws[1])), 1);
   return lws;
 }
...
@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
   std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/space_to_batch.cc
View file @ 88120708
...
@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("space_to_batch",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key = Concat(kernel_name, batch_tensor->dim(0),
                                   batch_tensor->dim(1), batch_tensor->dim(2),
                                   batch_tensor->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/winograd_transform.cc
View file @ 88120708
...
@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
                                   output_tensor->dim(0), output_tensor->dim(1),
                                   output_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
 
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3), input_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/libmace/mace.cc
View file @ 88120708
...
@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
   MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
              strerror(errno));
 }
 
+#ifdef MACE_ENABLE_OPENCL
+MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+  // Check OpenCL avaliable
+  auto runtime = OpenCLRuntime::Global();
+  if (!runtime->is_opencl_avaliable()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  // Check whether model max OpenCL image sizes exceed OpenCL limitation.
+  if (net_def == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+  if (!runtime->IsImageSupport()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  auto opencl_max_image_size = runtime->GetMaxImage2DSize();
+  if (opencl_max_image_size.empty()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  const std::vector<int64_t> net_max_image_size =
+      ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
+          *net_def, "opencl_max_image_size", {0, 0});
+  if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
+      || static_cast<uint64_t>(net_max_image_size[1])
+          > opencl_max_image_size[1]) {
+    LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
+              << " vs " << MakeString(net_max_image_size);
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+#endif
+
 }  // namespace
 
 // Mace Tensor
...
@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
     const std::vector<std::string> &output_nodes,
     const unsigned char *model_data) {
   LOG(INFO) << "Initializing MaceEngine";
+  // Check avalibility
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+  }
+#endif
   // Get input and output information.
   for (auto &input_info : net_def->input_info()) {
     input_info_map_[input_info.name()] = input_info;
...
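Since Init now surfaces GPU problems as a MaceStatus instead of crashing, applications can degrade gracefully. A hedged sketch of caller-side handling (gpu_engine and cpu_engine are hypothetical MaceEngine instances; only the Init signature and status values come from this diff):

// Illustrative fallback: if the GPU engine fails to initialize (e.g.
// MACE_OUT_OF_RESOURCES from the new CheckGPUAvalibility), retry the
// whole setup on the CPU device instead of aborting.
MaceStatus status = gpu_engine->Init(net_def, input_nodes, output_nodes,
                                     model_data);
if (status != MaceStatus::MACE_SUCCESS) {
  status = cpu_engine->Init(net_def, input_nodes, output_nodes, model_data);
}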
mace/proto/mace.proto
View file @ 88120708
...
@@ -20,6 +20,12 @@ enum DataType {
   DT_INT32 = 4;
 }
 
+enum MemoryType {
+  CPU_BUFFER = 0;
+  GPU_BUFFER = 1;
+  GPU_IMAGE = 2;
+}
+
 message ConstTensor {
   repeated int64 dims = 1;
   optional DataType data_type = 2 [default = DT_FLOAT];
...
@@ -73,8 +79,9 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional uint32 x = 2;
-  optional uint32 y = 3;
+  optional MemoryType mem_type = 2;
+  optional uint32 x = 3;
+  optional uint32 y = 4;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
...
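Note that inserting mem_type at field number 2 and renumbering x and y changes the wire format, so models serialized before this commit need to be reconverted. On the C++ side, standard protoc codegen gives the new field the accessors sketched below (illustrative usage, assuming conventional generated names):

#include "mace/proto/mace.pb.h"

// Sketch: consuming the new mem_type field via generated accessors.
int CountGpuImageBlocks(const mace::NetDef &net_def) {
  int gpu_image_blocks = 0;
  for (const mace::MemoryBlock &block : net_def.mem_arena().mem_block()) {
    if (block.mem_type() == mace::GPU_IMAGE) {
      ++gpu_image_blocks;  // allocated as a 2D OpenCL image of (x, y)
    }
  }
  return gpu_image_blocks;
}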
mace/public/mace_runtime.h
View file @ 88120708
...
@@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };
 
-// Set Key-Value store factory. (Call Once)
-// Now KVStorage is used to store the built OpenCL binaries to file,
-// which could speed up the GPU initialization and first run.
-// If do not call this API, the initialization maybe slow for GPU.
+/// \brief Set internal storage factory to store internal data. (Call once)
+///
+/// Now the path is used to store the built OpenCL binaries to file,
+/// which could speed up the GPU initialization and first run.
+/// If do not call this API, the initialization maybe slow for GPU.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
 
-// Just call once. (Not thread-safe)
-// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
-// if you use gpu of specific soc.
-// Using OpenCL binary will speed up the initialization.
-// OpenCL binary is corresponding to the OpenCL Driver version,
-// you should update the binary when OpenCL Driver changed.
+/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+///
+/// Just call once. (Not thread-safe)
+/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+/// OpenCL binary is corresponding to the OpenCL Driver version,
+/// you should update the binary when OpenCL Driver changed.
+///
+/// \param paths MACE will use first file found in all paths
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
 
-// Just call once. (Not thread-safe)
-// Set the path of Generated OpenCL parameter file
-// if you use gpu for specific soc.
-// The parameters is the local work group size tuned for specific SOC, which
-// may be faster than the general parameters.
+/// \brief Set the path of Generated OpenCL parameter file
+///
+/// Just call once. (Not thread-safe)
+/// If you use gpu for specific soc, The parameters is the local work group
+/// size tuned for specific SOC, which may be faster than the
+/// general parameters.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLParameterPath(const std::string &path);
 
-// Set GPU hints, currently only supports Adreno GPU.
-//
-// Caution: this function may hurt performance if improper parameters provided.
+/// \brief Set GPU hints, currently only supports Adreno GPU.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+///
+/// \param perf_hint performance hint
+/// \param priority_hint priority hint
+/// \return
 __attribute__((visibility("default")))
 void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
 
-// Set OpenMP threads number and affinity policy.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
-// the function will set the threads number equaling to the number of
-// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-// (AFFINITY_NONE) cores according to the policy. The threads number will
-// also be truncated to the corresponding cores number when num_threads_hint
-// is larger than it.
-//
-// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
-// suggested to use AFFINITY_NONE to use all cores.
+/// \brief Set OpenMP threads number and affinity policy.
+///
+/// Caution: this function may hurt performance if improper parameters provided.
+/// When num_threads_hint is zero or negative,
+/// the function will set the threads number equaling to the number of
+/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+/// (AFFINITY_NONE) cores according to the policy. The threads number will
+/// also be truncated to the corresponding cores number when num_threads_hint
+/// is larger than it.
+/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
+/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
+///
+/// \param num_threads_hint it is only a hint.
+/// \param policy one of CPUAffinityPolicy
+/// \param status MACE_SUCCESS for successful, or it can't reliabley
+/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+/// suggested to use AFFINITY_NONE to use all cores.
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy);
 
-// Set OpenMP threads number and processor affinity.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// This function may not work well on some chips (e.g. MTK). Setting thread
-// affinity to offline cores may run very slow or unexpectedly. In such cases,
-// please use SetOpenMPThreadPolicy with default policy instead.
+/// \brief Set OpenMP threads number and processor affinity.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+/// This function may not work well on some chips (e.g. MTK). Setting thread
+/// affinity to offline cores may run very slow or unexpectedly.
+/// In such cases, please use SetOpenMPThreadPolicy with default policy
+/// instead.
+///
+/// \param num_threads
+/// \param cpu_ids
+/// \param status
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
                                    const std::vector<int> &cpu_ids);
 
-// Get ARM big.LITTLE configuration.
-//
-// This function will detect the max frequencies of all CPU cores, and assume
-// the cores with largest max frequencies as big cores, and all the remaining
-// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-// little_core_ids will both be filled with all cpu core ids.
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect the frequency of big-LITTLE cores (e.g. MTK).
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, and assume
+/// the cores with largest max frequencies as big cores, and all the remaining
+/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
+/// little_core_ids will both be filled with all cpu core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return If successful, it returns MACE_SUCCESS and error if it can't
+/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
...
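A short usage sketch of the status-returning thread policy API documented above (the policy values follow the comments; the fallback choice is the one the documentation itself suggests, and the mace namespace is assumed from the public header):

#include "mace/public/mace_runtime.h"

// Pin OpenMP threads to big cores; fall back to all cores when
// big.LITTLE detection fails, as the docs above recommend.
void ConfigureThreads() {
  mace::MaceStatus status = mace::SetOpenMPThreadPolicy(
      4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  if (status != mace::MaceStatus::MACE_SUCCESS) {
    mace::SetOpenMPThreadPolicy(-1, mace::CPUAffinityPolicy::AFFINITY_NONE);
  }
}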
mace/python/tools/convert_util.py
View file @ 88120708
...
@@ -12,7 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import enum
+
 
 def mace_check(condition, msg):
     if not condition:
         raise Exception(msg)
+
+
+def roundup_div4(value):
+    return int((value + 3) / 4)
+
+
+class OpenCLBufferType(enum.Enum):
+    CONV2D_FILTER = 0
+    IN_OUT_CHANNEL = 1
+    ARGUMENT = 2
+    IN_OUT_HEIGHT = 3
+    IN_OUT_WIDTH = 4
+    WINOGRAD_FILTER = 5
+    DW_CONV2D_FILTER = 6
+    WEIGHT_HEIGHT = 7
+    WEIGHT_WIDTH = 8
+
+
+def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
+    # keep the same with mace/kernel/opencl/helper.cc
+    image_shape = [0, 0]
+    if buffer_type == OpenCLBufferType.CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
+        image_shape[0] = shape[1]
+        image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
+        mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[3]) * shape[2]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.ARGUMENT:
+        mace_check(len(shape) == 1,
+                   "Argument buffer should be 1D not " + str(shape))
+        image_shape[0] = roundup_div4(shape[0])
+        image_shape[1] = 1
+    elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = shape[2] * shape[3]
+        image_shape[1] = shape[0] * roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[2]) * shape[3]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1])
+        image_shape[1] = (shape[0] * (winograd_blk_size + 2)
+                          * (winograd_blk_size + 2))
+    elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = shape[0] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = shape[1] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
+        image_shape[1] = shape[0]
+    else:
+        mace_check(False,
+                   "OpenCL Image do not support type " + str(buffer_type))
+    return image_shape
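The comment in calculate_image_shape says it must match mace/kernel/opencl/helper.cc. For the common IN_OUT_CHANNEL case (an NHWC buffer mapped onto a 2D OpenCL image), the rule above reads in C++ as the small mirror below (an illustrative restatement of the Python logic, not the helper.cc source): image width = ceil(C/4) * W, image height = N * H.

#include <cstdint>
#include <vector>

// Mirror of the Python IN_OUT_CHANNEL branch: 4 channels pack into one
// RGBA pixel, so e.g. NHWC {1, 224, 224, 3} maps to a 224 x 224 image.
std::vector<int64_t> InOutChannelImageShape(const std::vector<int64_t> &nhwc) {
  const int64_t n = nhwc[0], h = nhwc[1], w = nhwc[2], c = nhwc[3];
  return {(c + 3) / 4 * w, n * h};
}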
mace/python/tools/converter.py
View file @ 88120708
...
@@ -171,6 +171,13 @@ def main(unused_args):
             output_graph_def.op.extend(cpu_graph_def.op)
             output_graph_def.mem_arena.mem_block.extend(
                 cpu_graph_def.mem_arena.mem_block)
+            output_graph_arg_names = set()
+            for arg in output_graph_def.arg:
+                output_graph_arg_names.add(arg.name)
+
+            for arg in cpu_graph_def.arg:
+                if arg.name not in output_graph_arg_names:
+                    output_graph_def.arg.extend(arg)
             print "Merge done"
         else:
             option.device = device_type_map[FLAGS.runtime]
...
mace/python/tools/converter_tool/base_converter.py
View file @ 88120708
...
@@ -163,6 +163,7 @@ class MaceKeyword(object):
     mace_op_data_type_str = 'T'
     mace_offset_str = 'offset'
     mace_from_caffe_str = 'from_caffe'
+    mace_opencl_max_image_size = "opencl_max_image_size"
 
 
 class TransformerRule(Enum):
...
mace/python/tools/converter_tool/transformer.py
View file @ 88120708
...
@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import TransformerRule
+from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import mace_check
-
-OPENCL_IMAGE_MAX_SIZE = 16384
-
-
-class OpenCLBufferType(enum.Enum):
-    CONV2D_FILTER = 0
-    IN_OUT_CHANNEL = 1
-    ARGUMENT = 2
-    IN_OUT_HEIGHT = 3
-    IN_OUT_WIDTH = 4
-    WINOGRAD_FILTER = 5
-    DW_CONV2D_FILTER = 6
-    WEIGHT_HEIGHT = 7
-    WEIGHT_WIDTH = 8
+from mace.python.tools.convert_util import OpenCLBufferType
+
+OPENCL_IMAGE_MAX_SIZE = 16384
 
 
 class Transformer(base_converter.ConverterInterface):
...
@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
         self._producer = {}
         self._target_data_format = DataFormat.NHWC
         self._input_output_added = False
+        self._opencl_max_image_size = [0, 0]
 
         if self._option.device == DeviceType.CPU.value:
             self._target_data_format = DataFormat.NCHW
...
@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
         arg.name = MaceKeyword.mace_mode
         arg.i = 0
 
+        tensor_shape = list(self._consts[input_name].dims)
         if input_type == OpenCLBufferType.WINOGRAD_FILTER:
             blk_sqr = op.output_shape[0].dims[0]
             wino_blk = int(np.sqrt(blk_sqr)) - 2
             wino_arg = op_def.arg.add()
             wino_arg.name = MaceKeyword.mace_wino_block_size
             wino_arg.i = wino_blk
+            img_shape = calculate_image_shape(input_type, tensor_shape,
+                                              wino_blk)
+        else:
+            img_shape = calculate_image_shape(input_type, tensor_shape)
 
         op.input[input_idx] = output_name
 
+        # update OpenCL max image size
+        self._opencl_max_image_size[0] = max(
+            self._opencl_max_image_size[0], img_shape[0])
+        self._opencl_max_image_size[1] = max(
+            self._opencl_max_image_size[1], img_shape[1])
+
     def transform_buffer_image(self):
         if self._option.device != DeviceType.GPU.value:
             return False
...
@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
                        MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
                 self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
 
+        # Add OpenCL max image size
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_max_image_size
+        arg.ints.extend(self._opencl_max_image_size)
+
         for input_node in self._option.input_nodes.values():
             new_input_name = MaceKeyword.mace_input_node_name \
                 + '_' + input_node.name
...
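The Winograd bookkeeping above stores (m + 2)^2 in the first output dim for block size m, so the transformer recovers m as sqrt(blk_sqr) - 2. A one-line C++ restatement of that arithmetic with a worked example (illustrative helper, not MACE code):

#include <cmath>

// Inverse of the stored Winograd tile size: blk_sqr = (m + 2)^2, so
// m = sqrt(blk_sqr) - 2. E.g. blk_sqr = 16 gives m = 2; 36 gives m = 4.
int WinoBlockSize(int blk_sqr) {
  return static_cast<int>(std::sqrt(static_cast<double>(blk_sqr))) - 2;
}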
mace/python/tools/memory_optimizer.py
View file @ 88120708
...
@@ -16,6 +16,24 @@ import sys
 import operator
 from mace.proto import mace_pb2
+from mace.python.tools.converter_tool import base_converter as cvt
+from mace.python.tools.convert_util import calculate_image_shape
+from mace.python.tools.convert_util import OpenCLBufferType
+
+
+class MemoryBlock(object):
+    def __init__(self, mem_type, block):
+        self._mem_type = mem_type
+        self._block = block
+
+    @property
+    def mem_type(self):
+        return self._mem_type
+
+    @property
+    def block(self):
+        return self._block
 
 
 class MemoryOptimizer(object):
     def __init__(self, net_def):
...
@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
         self.op_mem = {}  # op_name->mem_id
         self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
-        self.total_cpu_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
...
@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
         return True
 
     def get_op_mem_block(self, op_type, output_shape):
-        return [reduce(operator.mul, output_shape, 1)]
+        return MemoryBlock(mace_pb2.CPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1)])
 
     def mem_size(self, memory_block):
-        return memory_block[0]
+        return memory_block.block[0]
 
     def sub_mem_block(self, mem_block1, mem_block2):
         return self.mem_size(mem_block1) - self.mem_size(mem_block2)
 
     def resize_mem_block(self, old_mem_block, op_mem_block):
-        return [max(old_mem_block[0], op_mem_block[0])]
+        return MemoryBlock(
+            old_mem_block.mem_type,
+            [max(old_mem_block.block[0], op_mem_block.block[0])])
 
     def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
             block.y = 1
 
     def get_total_origin_mem_size(self):
...
@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
     def get_total_optimized_mem_size(self):
         optimized_mem_size = 0
         for mem in self.mem_block:
-            print mem, self.mem_block[mem]
+            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
             optimized_mem_size += self.mem_size(self.mem_block[mem])
         return optimized_mem_size
...
@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
             best_mem_waste_size = sys.maxint
             for mid in self.idle_mem:
                 old_mem_block = self.mem_block[mid]
+                if old_mem_block.mem_type != op_mem_block.mem_type:
+                    continue
                 new_mem_block = self.resize_mem_block(
                     old_mem_block, op_mem_block)
                 add_mem_size = self.sub_mem_block(new_mem_block,
...
@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
-        elif op.type == 'Shape':
-            for i in range(len(op.output)):
-                mem_id = self.total_cpu_mem_count
-                self.total_cpu_mem_count += 1
-                op_mem_block = self.get_op_mem_block(
-                    op.type, op.output_shape[i].dims)
-                self.mem_block[mem_id] = op_mem_block
-            return False
         return op.type != 'ImageToBuffer'
 
     def get_op_mem_block(self, op_type, output_shape):
-        mem_block = [0, 0]
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_block[0] = output_shape[2]
-            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+            buffer_shape = list(output_shape) + [1]
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
+                                      buffer_shape))
         elif op_type == 'Shape':
-            mem_block[0] = output_shape[0]
-            mem_block[1] = 1
+            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                    [output_shape[0], 1])
         else:
             if len(output_shape) == 2:  # only support fc/softmax
-                mem_block[0] = int((output_shape[1] + 3) / 4)
-                mem_block[1] = output_shape[0]
+                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
             elif len(output_shape) == 4:
-                mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-                mem_block[1] = output_shape[0] * output_shape[1]
+                buffer_shape = output_shape
             else:
                 raise Exception('output shape dim size is not 2 or 4.')
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
+                                      buffer_shape))
         return mem_block
 
     def mem_size(self, memory_block):
-        return memory_block[0] * memory_block[1] * 4
+        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
+            return memory_block.block[0] * memory_block.block[1] * 4
+        else:
+            return memory_block.block[0]
 
     def resize_mem_block(self, old_mem_block, op_mem_block):
-        resize_mem_block = [
-            max(old_mem_block[0], op_mem_block[0]),
-            max(old_mem_block[1], op_mem_block[1])
-        ]
+        resize_mem_block = MemoryBlock(
+            old_mem_block.mem_type,
+            [max(old_mem_block.block[0], op_mem_block.block[0]),
+             max(old_mem_block.block[1], op_mem_block.block[1])])
         return resize_mem_block
 
     def add_net_mem_blocks(self):
+        max_image_size_x = 0
+        max_image_size_y = 0
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
-            block.y = self.mem_block[mem][1]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
+            block.y = self.mem_block[mem].block[1]
+            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
+                max_image_size_x = max(max_image_size_x, block.x)
+                max_image_size_y = max(max_image_size_y, block.y)
+
+        # Update OpenCL max image size
+        net_ocl_max_img_size_arg = None
+        for arg in self.net_def.arg:
+            if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
+                net_ocl_max_img_size_arg = arg
+                max_image_size_x = max(arg.ints[0], max_image_size_x)
+                max_image_size_y = max(arg.ints[1], max_image_size_y)
+                break
+
+        if net_ocl_max_img_size_arg is None:
+            net_ocl_max_img_size_arg = self.net_def.arg.add()
+            net_ocl_max_img_size_arg.name = \
+                cvt.MaceKeyword.mace_opencl_max_image_size
+
+        net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
+                                            max_image_size_y]
 
     def mem_id_base(self):
         return 20000
...
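The GPU size accounting above treats an image block as x * y pixels of 4 channels each, while a CPU buffer block counts raw elements. A one-line C++ mirror of that rule with a worked example (illustrative helper, not MACE code):

#include <cstdint>

// Mirror of GPUMemoryOptimizer.mem_size: a GPU image of (x, y) holds
// x * y * 4 values, e.g. (128, 256) -> 131072; a CPU buffer counts
// only its first dimension.
int64_t MemSizeValues(bool is_gpu_image, int64_t x, int64_t y) {
  return is_gpu_image ? x * y * 4 : x;
}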
mace/python/tools/model.jinja2
View file @ 88120708
...
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
     mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
     mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
+    mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
     mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
     mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
...
mace/test/mace_api_mt_test.cc
View file @ 88120708
...
@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
...
@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...
mace/test/mace_api_test.cc
View file @ 88120708
...
@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
...
@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...