慢慢CG / Mace (forked from Xiaomi / Mace)

Commit 0d94aeae

Merge branch 'fix-opencl-cache-bug' into 'master'

Fix opencl cache update bug. See merge request !687

Authored July 30, 2018 by 李寅
Parents: 58360e63, 88120708

Showing 48 changed files with 1,178 additions and 570 deletions (+1178 −570)
Changed files:

mace/core/file_storage.cc                            +11    −4
mace/core/file_storage.h                              +1    −0
mace/core/runtime/opencl/opencl_allocator.cc         +11    −4
mace/core/runtime/opencl/opencl_runtime.cc          +111   −35
mace/core/runtime/opencl/opencl_runtime.h            +21    −7
mace/core/runtime/opencl/opencl_wrapper.cc          +319  −161
mace/core/workspace.cc                               +22   −20
mace/examples/cli/example.cc                          +4    −1
mace/kernels/opencl/activation.cc                     +4    −2
mace/kernels/opencl/addn.cc                           +4    −2
mace/kernels/opencl/batch_norm.cc                     +4    −2
mace/kernels/opencl/bias_add.cc                       +3    −2
mace/kernels/opencl/buffer_to_image.cc                +5    −3
mace/kernels/opencl/channel_shuffle.cc                +5    −3
mace/kernels/opencl/concat.cc                        +19   −10
mace/kernels/opencl/conv_2d_1x1.cc                   +31   −23
mace/kernels/opencl/conv_2d_3x3.cc                   +27   −20
mace/kernels/opencl/conv_2d_general.cc               +31   −24
mace/kernels/opencl/crop.cc                          +16    −8
mace/kernels/opencl/deconv_2d_opencl.cc               +4    −2
mace/kernels/opencl/depth_to_space.cc                 +6    −3
mace/kernels/opencl/depthwise_conv.cc                +29   −21
mace/kernels/opencl/eltwise.cc                        +4    −2
mace/kernels/opencl/fully_connected.cc                +8    −6
mace/kernels/opencl/helper.cc                        +38   −27
mace/kernels/opencl/helper.h                         +11   −11
mace/kernels/opencl/image_to_buffer.cc                +6    −4
mace/kernels/opencl/matmul.cc                         +4    −2
mace/kernels/opencl/out_of_range_check_test.cc       +11    −4
mace/kernels/opencl/pad.cc                            +4    −2
mace/kernels/opencl/pooling.cc                       +22   −13
mace/kernels/opencl/reduce_mean_opencl.cc             +8    −4
mace/kernels/opencl/resize_bilinear.cc               +28   −19
mace/kernels/opencl/slice.cc                          +5    −2
mace/kernels/opencl/softmax.cc                       +18   −12
mace/kernels/opencl/space_to_batch.cc                 +6    −3
mace/kernels/opencl/winograd_transform.cc            +12    −6
mace/libmace/mace.cc                                 +44    −0
mace/proto/mace.proto                                 +9    −2
mace/public/mace_runtime.h                           +77   −51
mace/python/tools/convert_util.py                    +65    −0
mace/python/tools/converter.py                        +7    −0
mace/python/tools/converter_tool/base_converter.py    +1    −0
mace/python/tools/converter_tool/transformer.py      +20   −12
mace/python/tools/memory_optimizer.py                +77   −31
mace/python/tools/model.jinja2                        +1    −0
mace/test/mace_api_mt_test.cc                         +2    −0
mace/test/mace_api_test.cc                            +2    −0
mace/core/file_storage.cc

```diff
@@ -37,8 +37,8 @@ int FileStorage::Load() {
   struct stat st;
   if (stat(file_path_.c_str(), &st) == -1) {
     if (errno == ENOENT) {
-      LOG(INFO) << "File " << file_path_ << " does not exist";
+      VLOG(1) << "File " << file_path_ << " does not exist";
       return 0;
     } else {
       LOG(WARNING) << "Stat file " << file_path_
@@ -121,13 +121,20 @@ int FileStorage::Load() {
   return 0;
 }
 
+void FileStorage::Clear() {
+  utils::WriteLock lock(&data_mutex_);
+  data_.clear();
+  data_changed_ = true;
+}
+
 bool FileStorage::Insert(const std::string &key,
                          const std::vector<unsigned char> &value) {
   utils::WriteLock lock(&data_mutex_);
   auto res = data_.emplace(key, value);
-  if (res.second) {
-    data_changed_ = true;
+  if (!res.second) {
+    data_[key] = value;
   }
+  data_changed_ = true;
   return true;
 }
```
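This `Insert` change is the cache-update bug named in the merge title: `std::map::emplace` is a no-op when the key already exists (the `bool` in its return pair is false), so the old code never overwrote a stale cached value and only marked the store dirty on first insertion. A minimal standalone sketch of the difference, using a hypothetical store rather than MACE code:

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> cache;
  cache.emplace("kernel_bin", "v1");

  // emplace() with an existing key leaves the mapped value untouched
  // and reports the collision via the bool in its return pair.
  auto res = cache.emplace("kernel_bin", "v2");
  std::cout << res.second << " -> " << cache["kernel_bin"] << "\n";  // 0 -> v1

  // The fixed Insert() falls back to assignment on collision,
  // which is what actually updates the cached entry.
  if (!res.second) cache["kernel_bin"] = "v2";
  std::cout << cache["kernel_bin"] << "\n";  // v2
}
```

Note the fix also moves `data_changed_ = true;` outside the branch, so an overwrite marks the store dirty and gets flushed back to disk.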
mace/core/file_storage.h

```diff
@@ -30,6 +30,7 @@ class FileStorage : public KVStorage {
  public:
   int Load() override;
+  void Clear() override;
   bool Insert(const std::string &key,
               const std::vector<unsigned char> &value) override;
   const std::vector<unsigned char> *Find(const std::string &key) override;
```
mace/core/runtime/opencl/opencl_allocator.cc

```diff
@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              offset, nbytes, nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1,
       nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   auto queue = OpenCLRuntime::Global()->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+  }
 }
 
 bool OpenCLAllocator::OnHost() const { return false; }
```
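With `MACE_CHECK_CL_SUCCESS` gone, a failed mapping now surfaces as a `nullptr` return instead of aborting the process, which shifts responsibility to callers. A hedged sketch of such a call site (a hypothetical function, not part of this diff):

```cpp
#include <cstring>

// Hypothetical caller of the Map()/Unmap() pair shown above; assumes the
// MaceStatus and OpenCLAllocator declarations from the MACE headers.
MaceStatus CopyToDevice(const OpenCLAllocator &allocator, void *buffer,
                        const void *src, size_t nbytes) {
  void *mapped = allocator.Map(buffer, /*offset=*/0, nbytes);
  if (mapped == nullptr) {
    // Mapping failed; the error was already logged by Map().
    return MaceStatus::MACE_OUT_OF_RESOURCES;  // assumed recovery path
  }
  std::memcpy(mapped, src, nbytes);
  allocator.Unmap(buffer, mapped);
  return MaceStatus::MACE_SUCCESS;
}
```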
mace/core/runtime/opencl/opencl_runtime.cc

```diff
@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
 OpenCLRuntime::OpenCLRuntime():
     precompiled_binary_storage_(nullptr),
     cache_storage_(nullptr),
-    is_profiling_enabled_(false) {
+    is_opencl_avaliable_(false),
+    is_profiling_enabled_(false),
+    opencl_version_(CL_VER_UNKNOWN),
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
-    LOG(FATAL) << "No OpenCL platforms found";
+    LOG(ERROR) << "No OpenCL platforms found";
+    return;
   }
   cl::Platform default_platform = all_platforms[0];
   std::stringstream ss;
@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
   std::vector<cl::Device> all_devices;
   default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
   if (all_devices.size() == 0) {
-    LOG(FATAL) << "No OpenCL devices found";
+    LOG(ERROR) << "No OpenCL devices found";
+    return;
   }
   bool gpu_detected = false;
@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = ParseDeviceVersion(device_version);
+      if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
+        return;
+      }
       VLOG(1) << "Using device: " << device_name;
       break;
     }
   }
   if (!gpu_detected) {
-    LOG(FATAL) << "No GPU device found";
+    LOG(ERROR) << "No GPU device found";
+    return;
   }
   cl_command_queue_properties properties = 0;
@@ -384,15 +393,22 @@ OpenCLRuntime::OpenCLRuntime():
           new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
     }
   }
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }
 
   command_queue_ = std::make_shared<cl::CommandQueue>(*context_, *device_,
                                                       properties, &err);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }
 
   extern std::shared_ptr<KVStorageFactory> kStorageFactory;
+  std::string cached_binary_platform_info;
   if (kStorageFactory != nullptr) {
     cache_storage_ = kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
@@ -405,17 +421,17 @@ OpenCLRuntime::OpenCLRuntime():
     auto platform_info_array =
         this->cache_storage_->Find(kOpenCLPlatformInfoKey);
     if (platform_info_array != nullptr) {
-      cached_binary_platform_info_ =
+      cached_binary_platform_info =
           std::string(platform_info_array->begin(),
                       platform_info_array->end());
+      if (cached_binary_platform_info != platform_info_) {
+        cache_storage_->Clear();
+      }
     }
   }
 
-  if (cached_binary_platform_info_ != platform_info_) {
-    if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      LOG(WARNING) << "There is no precompiled OpenCL binary in"
-                      " all OpenCL binary paths";
-    } else {
+  if (cached_binary_platform_info != platform_info_) {
+    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
       precompiled_binary_storage_.reset(
           new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
       if (precompiled_binary_storage_->Load() != 0) {
@@ -446,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
   } else {
     this->out_of_range_check_ = false;
   }
+
+  is_opencl_avaliable_ = true;
 }
 
 OpenCLRuntime::~OpenCLRuntime() {
@@ -456,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
   device_.reset();
 }
 
+bool OpenCLRuntime::is_opencl_avaliable() {
+  static const uint64_t kMinWorkGroupSize = 64;
+  return is_opencl_avaliable_
+      && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
+}
+
 cl::Context &OpenCLRuntime::context() { return *context_; }
 
 cl::Device &OpenCLRuntime::device() { return *device_; }
@@ -476,11 +500,6 @@ bool OpenCLRuntime::BuildProgramFromCache(
     cl::Program *program) {
   // Find from binary
   if (this->cache_storage_ == nullptr) return false;
-  if (cached_binary_platform_info_ != platform_info_) {
-    VLOG(3) << "cached OpenCL binary version is not same"
-               " with current version";
-    return false;
-  }
   auto content = this->cache_storage_->Find(built_program_key);
   if (content == nullptr) {
     return false;
@@ -539,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
   return true;
 }
 
-void OpenCLRuntime::BuildProgramFromSource(
+bool OpenCLRuntime::BuildProgramFromSource(
     const std::string &program_name,
     const std::string &built_program_key,
     const std::string &build_options_str,
@@ -563,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
       LOG(WARNING) << "Build program " << program_name
                    << " from source failed: " << MakeString(ret);
-      return;
+      return false;
     }
 
     // Keep built program binary
@@ -573,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
                                   sizeof(size_t) * device_list_size,
                                   program_binary_sizes.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
         new std::unique_ptr<unsigned char[]>[device_list_size]);
     for (cl_uint i = 0; i < device_list_size; ++i) {
@@ -584,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
                            sizeof(unsigned char *) * device_list_size,
                            program_binaries.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::vector<unsigned char> content(
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
@@ -601,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     VLOG(3) << "Program from source: " << built_program_key;
   }
+  return true;
 }
 
-void OpenCLRuntime::BuildProgram(const std::string &program_name,
+bool OpenCLRuntime::BuildProgram(const std::string &program_name,
                                  const std::string &built_program_key,
                                  const std::string &build_options,
                                  cl::Program *program) {
@@ -618,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     ret = BuildProgramFromPrecompiledBinary(built_program_key,
                                             build_options_str, program);
     if (!ret) {
-      BuildProgramFromSource(program_name, built_program_key,
-                             build_options_str, program);
+      ret = BuildProgramFromSource(program_name, built_program_key,
+                                   build_options_str, program);
     }
   }
+  return ret;
 }
 
-cl::Kernel OpenCLRuntime::BuildKernel(
+MaceStatus OpenCLRuntime::BuildKernel(
     const std::string &program_name,
     const std::string &kernel_name,
-    const std::set<std::string> &build_options) {
+    const std::set<std::string> &build_options,
+    cl::Kernel *kernel) {
   std::string build_options_str;
   for (auto &option : build_options) {
     build_options_str += " " + option;
@@ -640,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key,
-                       build_options_str, &program);
+    bool ret = this->BuildProgram(program_name, built_program_key,
+                                  build_options_str, &program);
+    if (!ret) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
     built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(program, kernel_name.c_str());
+  cl_int err;
+  *kernel = cl::Kernel(program, kernel_name.c_str(), &err);
+  MACE_CL_RET_STATUS(err);
+  return MaceStatus::MACE_SUCCESS;
 }
 
 void OpenCLRuntime::SaveBuiltCLProgram() {
@@ -668,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
 uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
 
 uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
 
+bool OpenCLRuntime::IsImageSupport() {
+  cl_bool res;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
+  return res == CL_TRUE;
+}
+
+std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
+  size_t max_height, max_width;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  return {max_height, max_width};
+}
+
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  cl_int err =
+      kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
 
 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  cl_int err =
+      kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
@@ -718,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
   } else if (words[1] == "1.0") {
     return OpenCLVersion::CL_VER_1_0;
   } else {
-    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
-    return OpenCLVersion::CL_VER_1_0;
+    LOG(ERROR) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_UNKNOWN;
  }
 }
```
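Downgrading the constructor's `LOG(FATAL)` calls to `LOG(ERROR)` plus an early return means a machine without a usable GPU now yields a half-initialized singleton instead of a crash, and `is_opencl_avaliable()` (the misspelling is the API's own) is the probe callers use before taking the GPU path. The actual guard added in mace/libmace/mace.cc (+44 lines) is not expanded on this page, so the following is only an assumed sketch of the pattern:

```cpp
// Assumed usage; the real check lives in mace/libmace/mace.cc, whose diff
// is collapsed here. CreateGpuEngineOrFallback is a hypothetical name.
MaceStatus CreateGpuEngineOrFallback(DeviceType *device_type) {
  if (!OpenCLRuntime::Global()->is_opencl_avaliable()) {
    LOG(WARNING) << "OpenCL is unavailable or the device is too weak; "
                 << "falling back to CPU";
    *device_type = DeviceType::CPU;
  }
  return MaceStatus::MACE_SUCCESS;
}
```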
mace/core/runtime/opencl/opencl_runtime.h

```diff
@@ -42,13 +42,23 @@ enum OpenCLVersion {
   CL_VER_1_1,
   CL_VER_1_2,
   CL_VER_2_0,
+  CL_VER_UNKNOWN,
 };
 
 const std::string OpenCLErrorToString(cl_int error);
 
-#define MACE_CHECK_CL_SUCCESS(error) \
-  MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)
+#define MACE_CL_RET_ERROR(error)                            \
+  if (error != CL_SUCCESS) {                                \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error);  \
+    return error;                                           \
+  }
+
+#define MACE_CL_RET_STATUS(error)                           \
+  if (error != CL_SUCCESS) {                                \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error);  \
+    return MaceStatus::MACE_OUT_OF_RESOURCES;               \
+  }
 
 class OpenCLProfilingTimer : public Timer {
  public:
@@ -81,19 +91,23 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  bool is_opencl_avaliable();
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
+  bool IsImageSupport();
+  std::vector<uint64_t> GetMaxImage2DSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
   bool IsOutOfRangeCheckEnabled() const;
   bool is_profiling_enabled() const;
 
-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
+  MaceStatus BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options,
+                         cl::Kernel *kernel);
 
   void SaveBuiltCLProgram();
@@ -103,7 +117,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime &) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
 
-  void BuildProgram(const std::string &program_file_name,
+  bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);
@@ -115,7 +129,7 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  void BuildProgramFromSource(
+  bool BuildProgramFromSource(
       const std::string &program_name,
       const std::string &built_program_key,
       const std::string &build_options_str,
@@ -125,6 +139,7 @@ class OpenCLRuntime {
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
   std::unique_ptr<KVStorage> cache_storage_;
+  bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
@@ -136,7 +151,6 @@ class OpenCLRuntime {
   std::string platform_info_;
   OpenCLVersion opencl_version_;
   std::string precompiled_binary_platform_info_;
-  std::string cached_binary_platform_info_;
   bool out_of_range_check_;
   uint64_t device_gloabl_mem_cache_size_;
   uint32_t device_compute_units_;
```
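The two new macros split error handling by return type: `MACE_CL_RET_ERROR` propagates the raw `cl_int` (used inside the tuning lambdas in helper.cc, which return `cl_int`), while `MACE_CL_RET_STATUS` converts any OpenCL failure into `MaceStatus::MACE_OUT_OF_RESOURCES` at `MaceStatus`-returning boundaries. A small sketch of the convention in hypothetical helpers (names are assumptions, not from the diff):

```cpp
// Hypothetical functions showing the intended use of both macros.
cl_int EnqueueOnce(cl::CommandQueue *queue, const cl::Kernel &kernel,
                   const cl::NDRange &global, cl::Event *event) {
  cl_int error = queue->enqueueNDRangeKernel(kernel, cl::NullRange, global,
                                             cl::NullRange, nullptr, event);
  MACE_CL_RET_ERROR(error);  // cl_int-returning scope: pass the code through
  return CL_SUCCESS;
}

MaceStatus RunOnce(cl::CommandQueue *queue, const cl::Kernel &kernel,
                   const cl::NDRange &global) {
  cl::Event event;
  cl_int error = EnqueueOnce(queue, kernel, global, &event);
  MACE_CL_RET_STATUS(error);  // MaceStatus scope: map failure to a status
  return MaceStatus::MACE_SUCCESS;
}
```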
mace/core/runtime/opencl/opencl_wrapper.cc

(This diff is collapsed on the original page and not shown; +319 −161.)
mace/core/workspace.cc

```diff
@@ -204,26 +204,28 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::GPU) {
-      // TODO(liuqi): refactor based on PB
-      if (mem_block.mem_id() >= 20000) {
-        std::unique_ptr<BufferBase> image_buf(new Image());
-        MACE_RETURN_IF_ERROR(image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(image_buf));
-      }
-    } else {
-      if (mem_block.mem_id() < 20000) {
-        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type)));
-        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-                + MACE_EXTRA_BUFFER_PAD_SIZE));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(tensor_buf));
-      }
+    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)
+              + MACE_EXTRA_BUFFER_PAD_SIZE));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+      std::unique_ptr<BufferBase> image_buf(new Image());
+      MACE_RETURN_IF_ERROR(image_buf->Allocate(
+          {mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
     }
   }
   VLOG(3) << "Preallocate buffer to tensors";
```
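Allocation is now driven by an explicit memory type recorded per memory block at model-conversion time (the `mem_type` field added to mace.proto in this commit) instead of the old magic-number convention where `mem_id() >= 20000` implied a GPU image. A toy sketch of the same three-way split, with simplified hypothetical types standing in for the generated protobuf classes:

```cpp
#include <cstdint>

// Simplified, hypothetical mirror of the dispatch above; the real
// MemoryType values come from the mace.proto change in this commit.
enum class MemoryType { CPU_BUFFER, GPU_BUFFER, GPU_IMAGE };

struct MemBlock {
  MemoryType mem_type;
  int mem_id;
  int64_t x, y;
};

const char *AllocatorKind(const MemBlock &b) {
  switch (b.mem_type) {
    case MemoryType::CPU_BUFFER: return "cpu buffer";
    case MemoryType::GPU_BUFFER: return "gpu buffer";  // works without image support
    case MemoryType::GPU_IMAGE:  return "gpu image2d"; // requires image support
  }
  return "unknown";  // unreachable for valid input
}
```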
mace/examples/cli/example.cc

```diff
@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
 #endif
   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments"
-              << std::endl;
+    std::cerr << "Create engine error, please check the arguments first, "
+              << "if correct, the device may not run the model, "
+              << "please fall back to other strategy." << std::endl;
     exit(1);
   }
```
mace/kernels/opencl/activation.cc

```diff
@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/addn.cc

```diff
@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("addn_opencl_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/batch_norm.cc

```diff
@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3),
              folded_constant_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/bias_add.cc

```diff
@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/buffer_to_image.cc

```diff
@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options, &b2f_kernel));
 
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/channel_shuffle.cc

```diff
@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("channel_shuffle", kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/concat.cc

```diff
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
     if (input0->dim(3) % 4 == 0) {
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
           cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
           cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (runtime->IsOutOfRangeCheckEnabled()) {
       (*kernel_error)->Map(nullptr);
       char *kerror_code = (*kernel_error)->mutable_data<char>();
```
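Every LocalWS/Default3DLocalWS helper touched by this commit gets the same wrap: the body divides by quantities derived from `kwg_size`, and since `GetKernelMaxWorkGroupSize()` can now report failure by returning 0 (instead of aborting), an unguarded body would divide by zero. The guarded shape, reduced to a minimal hedged sketch (`SafeLocalWS` is a hypothetical name; assumes `gws[i] > 0`):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Minimal sketch of the guard pattern applied to each LocalWS helper.
std::vector<uint32_t> SafeLocalWS(const uint32_t *gws, uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Work-group query failed: fall back to a trivial but valid shape.
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);       // >= 1 since gws[1] >= 1
    lws[0] = std::max<uint32_t>(kwg_size / lws[1], 1u);  // safe: lws[1] != 0
    lws[2] = std::max<uint32_t>(kwg_size / (lws[0] * lws[1]), 1u);
  }
  return lws;
}
```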
mace/kernels/opencl/conv_2d_1x1.cc

```diff
@@ -27,30 +27,36 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] < base) {
-      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
-    }
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = std::min<uint32_t>(gws[2], base);
-  }
-  lws[2] =
-      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = gws[0] / 8;
+      if (lws[0] < base) {
+        lws[0] = std::max<uint32_t>(gws[0] / 4, base);
+      }
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
+        gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = std::min<uint32_t>(gws[2], base);
+    }
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
+  }
   return lws;
 }
@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/conv_2d_3x3.cc

```diff
@@ -26,25 +26,30 @@ namespace {
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = std::max<uint32_t>(
-      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
-  const uint32_t base = std::max<uint32_t>(
-      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
-                              kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(RoundUp<uint32_t>(
-      cache_size / kernel_cache_size / lws_size / compute_units, base),
-      gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = std::min<uint32_t>(gws[2], base);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = std::max<uint32_t>(
+        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+    const uint32_t base = std::max<uint32_t>(
+        std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
+                                kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(RoundUp<uint32_t>(
+        cache_size / kernel_cache_size / lws_size / compute_units, base),
+        gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = std::min<uint32_t>(gws[2], base);
+    }
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[2] =
-      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/conv_2d_general.cc

```diff
@@ -30,30 +30,35 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = gws[0] / 4;
-  if (lws[0] == 0) {
-    lws[0] = gws[0];
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / kernel_size / lws_size /
-       compute_units) * 8,
-      gws[2]);
-  if (lws[2] == 0) {
-    if (gws[2] < lws_limit) {
-      lws[2] = gws[2];
-    } else {
-      lws[2] = base;
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = gws[0] / 4;
+    if (lws[0] == 0) {
+      lws[0] = gws[0];
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / kernel_size / lws_size /
+         compute_units) * 8,
+        gws[2]);
+    if (lws[2] == 0) {
+      if (gws[2] < lws_limit) {
+        lws[2] = gws[2];
+      } else {
+        lws[2] = base;
+      }
     }
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[2] =
-      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
              output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/crop.cc

```diff
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/deconv_2d_opencl.cc

```diff
@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/depth_to_space.cc

```diff
@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("depth_to_space",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   }
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
```
mace/kernels/opencl/depthwise_conv.cc

```diff
@@ -26,27 +26,33 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
-    if (lws[0] < base) {
-      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
-                                  kwg_size / lws[1]);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = cache_size / kBaseGPUMemCacheSize;
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
+      if (lws[0] < base) {
+        lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
+                                    kwg_size / lws[1]);
+      }
     }
+    lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]),
+                                1);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = gws[2];
+    }
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   }
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
-                              gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
   return lws;
 }
@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_conv2d", kernel_name,
+                                              built_options, kernel));
+
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0],
                                   gws[1], gws[2], multiplier);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/eltwise.cc

```diff
@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
```
mace/kernels/opencl/fully_connected.cc

```diff
@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));
 
     if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (future != nullptr) {
       future->wait_fn = [runtime, event](CallStats *stats) {
@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));
+
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("fc_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
```
mace/kernels/opencl/helper.cc
浏览文件 @
0d94aeae
...
...
@@ -245,23 +245,27 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std
::
vector
<
uint32_t
>
Default3DLocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
1
]
*
lws
[
2
];
lws
[
0
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
),
1
);
if
(
kwg_size
==
0
)
{
lws
[
0
]
=
lws
[
1
]
=
lws
[
2
]
=
1
;
}
else
{
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
1
]
*
lws
[
2
];
lws
[
0
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
),
1
);
}
return
lws
;
}
void
TuningOrRun3DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
MaceStatus
TuningOrRun3DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
...
...
@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
3
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
MACE_CHECK
(
params
[
i
]
!=
0
);
internal_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
}
...
...
@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
internal_gws
[
0
],
internal_gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_C
HECK_CL_SUCCESS
(
error
);
MACE_C
L_RET_ERROR
(
error
);
}
}
else
{
timer
->
ClearTiming
();
...
...
@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
internal_gws
[
0
],
internal_gws
[
1
],
internal_gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_C
HECK_CL_SUCCESS
(
error
);
MACE_C
L_RET_ERROR
(
error
);
timer
->
AccumulateTiming
();
tuning_result
->
assign
(
params
.
begin
(),
params
.
end
());
...
...
@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
internal_gws
[
0
],
internal_gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_C
HECK_CL_SUCCESS
(
error
);
MACE_C
L_RET_ERROR
(
error
);
timer
->
AccumulateTiming
();
}
}
...
...
@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
return
error
;
};
OpenCLProfilingTimer
timer
(
&
event
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
cl_int
err
=
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
tuning_key
,
lws
,
params_generator
,
func
,
&
timer
);
MACE_CL_RET_STATUS
(
err
);
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
event
](
CallStats
*
stats
)
{
...
...
@@ -388,13 +394,14 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
}
};
}
return
MaceStatus
::
MACE_SUCCESS
;
}
void
TuningOrRun2DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
MaceStatus
TuningOrRun2DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
...
...
@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
2
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
2
;
++
i
)
{
MACE_CHECK
(
params
[
i
]
!=
0
);
internal_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
}
...
...
@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
             kernel, cl::NDRange(0, i * block_size),
             cl::NDRange(internal_gws[0], gws1),
             cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK_CL_SUCCESS(error);
+        MACE_CL_RET_ERROR(error);
       }
     } else {
       timer->ClearTiming();
       error = runtime->command_queue().enqueueNDRangeKernel(
           kernel, cl::NullRange,
           cl::NDRange(internal_gws[0], internal_gws[1]),
           cl::NDRange(params[0], params[1]), nullptr, &event);
-      MACE_CHECK_CL_SUCCESS(error);
+      MACE_CL_RET_ERROR(error);
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
...
@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
             kernel, cl::NDRange(0, i * block_size),
             cl::NDRange(internal_gws[0], gws1),
             cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK_CL_SUCCESS(error);
+        MACE_CL_RET_ERROR(error);
         timer->AccumulateTiming();
       }
     }
...
@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     return error;
   };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
...
@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       }
     };
   }
+  return MaceStatus::MACE_SUCCESS;
 }

 }  // namespace kernels
...
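Note: the MACE_CL_RET_ERROR and MACE_CL_RET_STATUS macros used above are defined outside this diff. A minimal sketch of what their use here implies, with hypothetical bodies (the real definitions, and the exact cl_int-to-MaceStatus mapping, may differ):

// Hypothetical sketch only; not the definitions from this commit.
// Inside the tuning lambda (which returns cl_int), a failure is
// propagated to the caller instead of aborting the process:
#define MACE_CL_RET_ERROR(error) \
  if ((error) != CL_SUCCESS) {   \
    return (error);              \
  }

// At the MaceStatus boundary, a failed cl_int becomes an error status:
#define MACE_CL_RET_STATUS(error)             \
  if ((error) != CL_SUCCESS) {                \
    return MaceStatus::MACE_OUT_OF_RESOURCES; \
  }

This is the same pattern as the MACE_RETURN_IF_ERROR used by the callers below: OpenCL failures now flow up as return values rather than CHECK-crashes.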
mace/kernels/opencl/helper.h
@@ -65,17 +65,17 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpstreamCLDt(const DataType dt);

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future);
-
-void TuningOrRun2DKernel(const cl::Kernel &kernel,
-                         const std::string tuning_key,
-                         const uint32_t *gws,
-                         const std::vector<uint32_t> &lws,
-                         StatsFuture *future);
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future);
+
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
+                               const std::string tuning_key,
+                               const uint32_t *gws,
+                               const std::vector<uint32_t> &lws,
+                               StatsFuture *future);

 inline void SetFuture(StatsFuture *future, const cl::Event &event) {
   if (future != nullptr) {
...
mace/kernels/opencl/image_to_buffer.cc
@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       kernel_error_->UnMap();
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options,
+                                            &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/matmul.cc
@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/out_of_range_check_test.cc
@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
     kernel_error->UnMap();
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+  cl::Kernel b2f_kernel;
+  cl_int error = runtime->BuildKernel("buffer_to_image",
+                                      obfuscated_kernel_name,
+                                      built_options,
+                                      &b2f_kernel);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, kwg_size / 16};

   cl::Event event;
-  cl_int error;
   if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
...
@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
         b2f_kernel, cl::NullRange,
         cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   runtime->command_queue().finish();
   bool is_out_of_range = false;
...
mace/kernels/opencl/pad.cc
@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                   output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/pooling.cc
@@ -25,18 +25,23 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
-                              kwg_size / lws[1]);
-  const uint32_t lws_size = lws[1] * lws[2];
-  lws[0] = gws[0] / 4;
-  if (lws[0] == 0) {
-    lws[0] = gws[0];
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
+                                kwg_size / lws[1]);
+    const uint32_t lws_size = lws[1] * lws[2];
+    lws[0] = gws[0] / 4;
+    if (lws[0] == 0) {
+      lws[0] = gws[0];
+    }
+    lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
+                                1);
   }
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
-                              1);
   return lws;
 }
...
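To see what the guarded LocalWS computes, here is a standalone sketch with the OpenCL cache query replaced by an explicit base argument; all sizes below are hypothetical:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the patched pooling LocalWS above; `base` stands in for
// cache_size / kBaseGPUMemCacheSize so the sketch runs without a GPU.
std::vector<uint32_t> LocalWS(const uint32_t *gws, uint32_t kwg_size,
                              uint32_t base) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;  // new guard: avoids dividing by zero below
  } else {
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                                kwg_size / lws[1]);
    const uint32_t lws_size = lws[1] * lws[2];
    lws[0] = gws[0] / 4;
    if (lws[0] == 0) lws[0] = gws[0];
    lws[0] = std::max<uint32_t>(
        std::min<uint32_t>(lws[0], kwg_size / lws_size), 1);
  }
  return lws;
}

int main() {
  const uint32_t gws[3] = {64, 32, 8};
  std::vector<uint32_t> lws = LocalWS(gws, 256, 4);  // -> {2, 32, 4}
  std::cout << lws[0] << ", " << lws[1] << ", " << lws[2] << "\n";
  lws = LocalWS(gws, 0, 4);                          // -> {1, 1, 1}
  std::cout << lws[0] << ", " << lws[1] << ", " << lws[2] << "\n";
  return 0;
}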
@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::string tuning_key = Concat("pooling_opencl_kernel_", output->dim(0),
                                   output->dim(1), output->dim(2),
                                   output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/reduce_mean_opencl.cc
@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
     }
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", kernel_name,
+                                              built_options, &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }

   if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
...
@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;

   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
mace/kernels/opencl/resize_bilinear.cc
@@ -25,25 +25,30 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
   } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = gws[0] / 8;
+      if (lws[0] == 0) {
+        lws[0] = gws[0];
+      }
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = gws[2] / 8;
+    if (lws[2] == 0) {
+      lws[2] = gws[2];
     }
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                                1);
   }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = gws[2] / 8;
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
   return lws;
 }
...
@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("resize_bilinear", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/slice.cc
@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
           cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
           cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_error_->Map(nullptr);
       char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/softmax.cc
@@ -25,19 +25,23 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (gws[0] < base) {
-    lws[0] = gws[0];
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
   } else {
-    lws[0] = gws[0] / base;
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (gws[0] < base) {
+      lws[0] = gws[0];
+    } else {
+      lws[0] = gws[0] / base;
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    lws[2] = std::max<uint32_t>(
+        std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1])), 1);
   }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::max<uint32_t>(
-      std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1])), 1);
   return lws;
 }
...
@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
   std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/space_to_batch.cc
@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key = Concat(kernel_name, batch_tensor->dim(0),
                                   batch_tensor->dim(1), batch_tensor->dim(2),
                                   batch_tensor->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/winograd_transform.cc
@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
                                   output_tensor->dim(0), output_tensor->dim(1),
                                   output_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3), input_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(
+      TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/libmace/mace.cc
@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
   MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
              strerror(errno));
 }

+#ifdef MACE_ENABLE_OPENCL
+MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+  // Check OpenCL avaliable
+  auto runtime = OpenCLRuntime::Global();
+  if (!runtime->is_opencl_avaliable()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  // Check whether model max OpenCL image sizes exceed OpenCL limitation.
+  if (net_def == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+  if (!runtime->IsImageSupport()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  auto opencl_max_image_size = runtime->GetMaxImage2DSize();
+  if (opencl_max_image_size.empty()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  const std::vector<int64_t> net_max_image_size =
+      ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
+          *net_def, "opencl_max_image_size", {0, 0});
+  if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
+      || static_cast<uint64_t>(net_max_image_size[1])
+          > opencl_max_image_size[1]) {
+    LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
+              << " vs " << MakeString(net_max_image_size);
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+#endif
+
 }  // namespace

 // Mace Tensor
...
@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
     const std::vector<std::string> &output_nodes,
     const unsigned char *model_data) {
   LOG(INFO) << "Initializing MaceEngine";
+  // Check avalibility
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+  }
+#endif
   // Get input and output information.
   for (auto &input_info : net_def->input_info()) {
     input_info_map_[input_info.name()] = input_info;
...
mace/proto/mace.proto
@@ -20,6 +20,12 @@ enum DataType {
   DT_INT32 = 4;
 }

+enum MemoryType {
+  CPU_BUFFER = 0;
+  GPU_BUFFER = 1;
+  GPU_IMAGE = 2;
+}
+
 message ConstTensor {
   repeated int64 dims = 1;
   optional DataType data_type = 2 [default = DT_FLOAT];
...
@@ -73,8 +79,9 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional uint32 x = 2;
-  optional uint32 y = 3;
+  optional MemoryType mem_type = 2;
+  optional uint32 x = 3;
+  optional uint32 y = 4;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
...
mace/public/mace_runtime.h
@@ -51,6 +51,7 @@ class KVStorage {
  public:
   // return: 0 for success, -1 for error
   virtual int Load() = 0;
+  virtual void Clear() = 0;
   virtual bool Insert(const std::string &key,
                       const std::vector<unsigned char> &value) = 0;
   virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
...
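For reference, a minimal in-memory implementation sketch of the KVStorage interface as shown in this hunk; it assumes Load, Clear, Insert and Find are its only pure-virtual members, which the visible lines do not confirm:

#include <map>
#include <string>
#include <vector>

// Sketch only: implements exactly the virtuals visible in the hunk above.
class InMemoryKVStorage : public mace::KVStorage {
 public:
  int Load() override { return 0; }         // nothing persisted; 0 == success
  void Clear() override { data_.clear(); }  // the newly added virtual
  bool Insert(const std::string &key,
              const std::vector<unsigned char> &value) override {
    data_[key] = value;
    return true;
  }
  const std::vector<unsigned char> *Find(const std::string &key) override {
    auto it = data_.find(key);
    return it == data_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::string, std::vector<unsigned char>> data_;
};

Clear() gives callers a way to drop stale entries, which fits this commit's goal of updating the stored OpenCL cache instead of serving outdated binaries.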
@@ -78,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };

-// Set Key-Value store factory. (Call Once)
-// Now KVStorage is used to store the built OpenCL binaries to file,
-// which could speed up the GPU initialization and first run.
-// If do not call this API, the initialization maybe slow for GPU.
+/// \brief Set internal storage factory to store internal data. (Call once)
+///
+/// Now the path is used to store the built OpenCL binaries to file,
+/// which could speed up the GPU initialization and first run.
+/// If do not call this API, the initialization maybe slow for GPU.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);

-// Just call once. (Not thread-safe)
-// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
-// if you use gpu of specific soc.
-// Using OpenCL binary will speed up the initialization.
-// OpenCL binary is corresponding to the OpenCL Driver version,
-// you should update the binary when OpenCL Driver changed.
+/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+///
+/// Just call once. (Not thread-safe)
+/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+/// OpenCL binary is corresponding to the OpenCL Driver version,
+/// you should update the binary when OpenCL Driver changed.
+///
+/// \param paths MACE will use first file found in all paths
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);

-// Just call once. (Not thread-safe)
-// Set the path of Generated OpenCL parameter file
-// if you use gpu for specific soc.
-// The parameters is the local work group size tuned for specific SOC, which
-// may be faster than the general parameters.
+/// \brief Set the path of Generated OpenCL parameter file
+///
+/// Just call once. (Not thread-safe)
+/// If you use gpu for specific soc, The parameters is the local work group
+/// size tuned for specific SOC, which may be faster than the
+/// general parameters.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLParameterPath(const std::string &path);

-// Set GPU hints, currently only supports Adreno GPU.
-//
-// Caution: this function may hurt performance if improper parameters provided.
+/// \brief Set GPU hints, currently only supports Adreno GPU.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+///
+/// \param perf_hint performance hint
+/// \param priority_hint priority hint
+/// \return
 __attribute__((visibility("default")))
 void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);

-// Set OpenMP threads number and affinity policy.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
-// the function will set the threads number equaling to the number of
-// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-// (AFFINITY_NONE) cores according to the policy. The threads number will
-// also be truncated to the corresponding cores number when num_threads_hint
-// is larger than it.
-//
-// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
-// suggested to use AFFINITY_NONE to use all cores.
+/// \brief Set OpenMP threads number and affinity policy.
+///
+/// Caution: this function may hurt performance if improper parameters provided.
+/// When num_threads_hint is zero or negative,
+/// the function will set the threads number equaling to the number of
+/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+/// (AFFINITY_NONE) cores according to the policy. The threads number will
+/// also be truncated to the corresponding cores number when num_threads_hint
+/// is larger than it.
+/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
+/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
+///
+/// \param num_threads_hint it is only a hint.
+/// \param policy one of CPUAffinityPolicy
+/// \param status MACE_SUCCESS for successful, or it can't reliabley
+/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+/// suggested to use AFFINITY_NONE to use all cores.
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy);

-// Set OpenMP threads number and processor affinity.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// This function may not work well on some chips (e.g. MTK). Setting thread
-// affinity to offline cores may run very slow or unexpectedly. In such cases,
-// please use SetOpenMPThreadPolicy with default policy instead.
+/// \brief Set OpenMP threads number and processor affinity.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+/// This function may not work well on some chips (e.g. MTK). Setting thread
+/// affinity to offline cores may run very slow or unexpectedly.
+/// In such cases, please use SetOpenMPThreadPolicy with default policy
+/// instead.
+///
+/// \param num_threads
+/// \param cpu_ids
+/// \param status
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
                                    const std::vector<int> &cpu_ids);

-// Get ARM big.LITTLE configuration.
-//
-// This function will detect the max frequencies of all CPU cores, and assume
-// the cores with largest max frequencies as big cores, and all the remaining
-// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-// little_core_ids will both be filled with all cpu core ids.
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect the frequency of big-LITTLE cores (e.g. MTK).
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, and assume
+/// the cores with largest max frequencies as big cores, and all the remaining
+/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
+/// little_core_ids will both be filled with all cpu core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return If successful, it returns MACE_SUCCESS and error if it can't
+/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
...
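A short usage sketch of the entry points documented above; the paths, thread counts, and mace:: namespace qualification are assumptions for illustration:

#include <vector>
#include "mace/public/mace_runtime.h"

void ConfigureRuntime() {
  // Prefer the big cluster when big.LITTLE detection works; otherwise
  // fall back to all cores, as the GetBigLittleCoreIDs comment advises.
  std::vector<int> big_core_ids, little_core_ids;
  if (mace::GetBigLittleCoreIDs(&big_core_ids, &little_core_ids) ==
      mace::MaceStatus::MACE_SUCCESS) {
    mace::SetOpenMPThreadPolicy(static_cast<int>(big_core_ids.size()),
                                mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  } else {
    mace::SetOpenMPThreadPolicy(-1, mace::CPUAffinityPolicy::AFFINITY_NONE);
  }
  // Placeholder paths; the program must have read/write permission on them.
  mace::SetOpenCLBinaryPaths({"/data/local/tmp/mace/opencl_binary.bin"});
  mace::SetOpenCLParameterPath("/data/local/tmp/mace/opencl_parameter.bin");
}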
mace/python/tools/convert_util.py
@@ -12,7 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import enum
+
+
+def mace_check(condition, msg):
+    if not condition:
+        raise Exception(msg)
+
+
+def roundup_div4(value):
+    return int((value + 3) / 4)
+
+
+class OpenCLBufferType(enum.Enum):
+    CONV2D_FILTER = 0
+    IN_OUT_CHANNEL = 1
+    ARGUMENT = 2
+    IN_OUT_HEIGHT = 3
+    IN_OUT_WIDTH = 4
+    WINOGRAD_FILTER = 5
+    DW_CONV2D_FILTER = 6
+    WEIGHT_HEIGHT = 7
+    WEIGHT_WIDTH = 8
+
+
+def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
+    # keep the same with mace/kernel/opencl/helper.cc
+    image_shape = [0, 0]
+    if buffer_type == OpenCLBufferType.CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
+        image_shape[0] = shape[1]
+        image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
+        mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[3]) * shape[2]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.ARGUMENT:
+        mace_check(len(shape) == 1,
+                   "Argument buffer should be 1D not " + str(shape))
+        image_shape[0] = roundup_div4(shape[0])
+        image_shape[1] = 1
+    elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = shape[2] * shape[3]
+        image_shape[1] = shape[0] * roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[2]) * shape[3]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1])
+        image_shape[1] = (shape[0] * (winograd_blk_size + 2)
+                          * (winograd_blk_size + 2))
+    elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = shape[0] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = shape[1] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
+        image_shape[1] = shape[0]
+    else:
+        mace_check(False,
+                   "OpenCL Image do not support type " + str(buffer_type))
+    return image_shape
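As a quick sanity check of the new helper, a usage sketch with a hypothetical NHWC tensor shape:

from mace.python.tools.convert_util import OpenCLBufferType
from mace.python.tools.convert_util import calculate_image_shape

# IN_OUT_CHANNEL maps [N, H, W, C] to [ceil(C / 4) * W, N * H].
shape = [1, 224, 224, 32]  # hypothetical N, H, W, C
img_shape = calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, shape)
assert img_shape == [8 * 224, 1 * 224]  # [1792, 224]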
mace/python/tools/converter.py
@@ -171,6 +171,13 @@ def main(unused_args):
             output_graph_def.op.extend(cpu_graph_def.op)
             output_graph_def.mem_arena.mem_block.extend(
                 cpu_graph_def.mem_arena.mem_block)
+            output_graph_arg_names = set()
+            for arg in output_graph_def.arg:
+                output_graph_arg_names.add(arg.name)
+            for arg in cpu_graph_def.arg:
+                if arg.name not in output_graph_arg_names:
+                    output_graph_def.arg.extend(arg)
             print "Merge done"
         else:
             option.device = device_type_map[FLAGS.runtime]
...
mace/python/tools/converter_tool/base_converter.py
@@ -163,6 +163,7 @@ class MaceKeyword(object):
     mace_op_data_type_str = 'T'
     mace_offset_str = 'offset'
     mace_from_caffe_str = 'from_caffe'
+    mace_opencl_max_image_size = "opencl_max_image_size"


 class TransformerRule(Enum):
...
mace/python/tools/converter_tool/transformer.py
@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import TransformerRule
+from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import mace_check
-
-OPENCL_IMAGE_MAX_SIZE = 16384
-
-
-class OpenCLBufferType(enum.Enum):
-    CONV2D_FILTER = 0
-    IN_OUT_CHANNEL = 1
-    ARGUMENT = 2
-    IN_OUT_HEIGHT = 3
-    IN_OUT_WIDTH = 4
-    WINOGRAD_FILTER = 5
-    DW_CONV2D_FILTER = 6
-    WEIGHT_HEIGHT = 7
-    WEIGHT_WIDTH = 8
+from mace.python.tools.convert_util import OpenCLBufferType
+
+OPENCL_IMAGE_MAX_SIZE = 16384


 class Transformer(base_converter.ConverterInterface):
...
@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
         self._producer = {}
         self._target_data_format = DataFormat.NHWC
         self._input_output_added = False
+        self._opencl_max_image_size = [0, 0]

         if self._option.device == DeviceType.CPU.value:
             self._target_data_format = DataFormat.NCHW
...
@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
             arg.name = MaceKeyword.mace_mode
             arg.i = 0

+        tensor_shape = list(self._consts[input_name].dims)
         if input_type == OpenCLBufferType.WINOGRAD_FILTER:
             blk_sqr = op.output_shape[0].dims[0]
             wino_blk = int(np.sqrt(blk_sqr)) - 2
             wino_arg = op_def.arg.add()
             wino_arg.name = MaceKeyword.mace_wino_block_size
             wino_arg.i = wino_blk
+            img_shape = calculate_image_shape(input_type, tensor_shape,
+                                              wino_blk)
+        else:
+            img_shape = calculate_image_shape(input_type, tensor_shape)

         op.input[input_idx] = output_name
+        # update OpenCL max image size
+        self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0],
+                                             img_shape[0])
+        self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
+                                             img_shape[1])

     def transform_buffer_image(self):
         if self._option.device != DeviceType.GPU.value:
             return False
...
@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
                     MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
                 self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)

+        # Add OpenCL max image size
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_max_image_size
+        arg.ints.extend(self._opencl_max_image_size)
+
         for input_node in self._option.input_nodes.values():
             new_input_name = MaceKeyword.mace_input_node_name \
                              + '_' + input_node.name
...
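To make the new bookkeeping concrete, a sketch of how _opencl_max_image_size accumulates across converted tensors (the shapes are hypothetical):

from mace.python.tools.convert_util import OpenCLBufferType
from mace.python.tools.convert_util import calculate_image_shape

# Every buffer_to_image call folds its image shape into a running maximum,
# which transform_buffer_image then writes out as the net's
# "opencl_max_image_size" arg, later checked by CheckGPUAvalibility.
opencl_max_image_size = [0, 0]
for input_type, tensor_shape in [
        (OpenCLBufferType.IN_OUT_CHANNEL, [1, 224, 224, 32]),  # -> [1792, 224]
        (OpenCLBufferType.CONV2D_FILTER, [64, 32, 3, 3]),      # -> [32, 144]
]:
    img_shape = calculate_image_shape(input_type, tensor_shape)
    opencl_max_image_size[0] = max(opencl_max_image_size[0], img_shape[0])
    opencl_max_image_size[1] = max(opencl_max_image_size[1], img_shape[1])
assert opencl_max_image_size == [1792, 224]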
mace/python/tools/memory_optimizer.py
@@ -16,6 +16,24 @@ import sys
 import operator
 from mace.proto import mace_pb2
+from mace.python.tools.converter_tool import base_converter as cvt
+from mace.python.tools.convert_util import calculate_image_shape
+from mace.python.tools.convert_util import OpenCLBufferType
+
+
+class MemoryBlock(object):
+    def __init__(self, mem_type, block):
+        self._mem_type = mem_type
+        self._block = block
+
+    @property
+    def mem_type(self):
+        return self._mem_type
+
+    @property
+    def block(self):
+        return self._block


 class MemoryOptimizer(object):
     def __init__(self, net_def):
...
@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
         self.op_mem = {}  # op_name->mem_id
         self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
-        self.total_mem_count = 0
+        self.total_cpu_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
...
@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
         return True

     def get_op_mem_block(self, op_type, output_shape):
-        return [reduce(operator.mul, output_shape, 1)]
+        return MemoryBlock(mace_pb2.CPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1)])

     def mem_size(self, memory_block):
-        return memory_block[0]
+        return memory_block.block[0]

     def sub_mem_block(self, mem_block1, mem_block2):
         return self.mem_size(mem_block1) - self.mem_size(mem_block2)

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        return [max(old_mem_block[0], op_mem_block[0])]
+        return MemoryBlock(
+            old_mem_block.mem_type,
+            [max(old_mem_block.block[0], op_mem_block.block[0])])

     def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
             block.y = 1

     def get_total_origin_mem_size(self):
...
@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
     def get_total_optimized_mem_size(self):
         optimized_mem_size = 0
         for mem in self.mem_block:
-            print mem, self.mem_block[mem]
+            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
             optimized_mem_size += self.mem_size(self.mem_block[mem])
         return optimized_mem_size
...
@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
             best_mem_waste_size = sys.maxint
             for mid in self.idle_mem:
                 old_mem_block = self.mem_block[mid]
+                if old_mem_block.mem_type != op_mem_block.mem_type:
+                    continue
                 new_mem_block = self.resize_mem_block(
                     old_mem_block, op_mem_block)
                 add_mem_size = self.sub_mem_block(new_mem_block,
...
@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
+        elif op.type == 'Shape':
+            for i in range(len(op.output)):
+                mem_id = self.total_cpu_mem_count
+                self.total_cpu_mem_count += 1
+                op_mem_block = self.get_op_mem_block(
+                    op.type, op.output_shape[i].dims)
+                self.mem_block[mem_id] = op_mem_block
+            return False
         return op.type != 'ImageToBuffer'

     def get_op_mem_block(self, op_type, output_shape):
-        mem_block = [0, 0]
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_block[0] = output_shape[2]
-            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+            buffer_shape = list(output_shape) + [1]
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
+                                      buffer_shape))
         elif op_type == 'Shape':
-            mem_block[0] = output_shape[0]
-            mem_block[1] = 1
+            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                    [output_shape[0], 1])
         else:
             if len(output_shape) == 2:
                 # only support fc/softmax
-                mem_block[0] = int((output_shape[1] + 3) / 4)
-                mem_block[1] = output_shape[0]
+                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
             elif len(output_shape) == 4:
-                mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-                mem_block[1] = output_shape[0] * output_shape[1]
+                buffer_shape = output_shape
             else:
                 raise Exception('output shape dim size is not 2 or 4.')
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
+                                      buffer_shape))
         return mem_block

     def mem_size(self, memory_block):
-        return memory_block[0] * memory_block[1] * 4
+        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
+            return memory_block.block[0] * memory_block.block[1] * 4
+        else:
+            return memory_block.block[0]

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        resize_mem_block = [
-            max(old_mem_block[0], op_mem_block[0]),
-            max(old_mem_block[1], op_mem_block[1])
-        ]
+        resize_mem_block = MemoryBlock(
+            old_mem_block.mem_type,
+            [
+                max(old_mem_block.block[0], op_mem_block.block[0]),
+                max(old_mem_block.block[1], op_mem_block.block[1])
+            ])
         return resize_mem_block

     def add_net_mem_blocks(self):
+        max_image_size_x = 0
+        max_image_size_y = 0
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
-            block.y = self.mem_block[mem][1]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
+            block.y = self.mem_block[mem].block[1]
+            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
+                max_image_size_x = max(max_image_size_x, block.x)
+                max_image_size_y = max(max_image_size_y, block.y)
+
+        # Update OpenCL max image size
+        net_ocl_max_img_size_arg = None
+        for arg in self.net_def.arg:
+            if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
+                net_ocl_max_img_size_arg = arg
+                max_image_size_x = max(arg.ints[0], max_image_size_x)
+                max_image_size_y = max(arg.ints[1], max_image_size_y)
+                break
+        if net_ocl_max_img_size_arg is None:
+            net_ocl_max_img_size_arg = self.net_def.arg.add()
+            net_ocl_max_img_size_arg.name = \
+                cvt.MaceKeyword.mace_opencl_max_image_size
+        net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
+                                            max_image_size_y]

     def mem_id_base(self):
         return 20000
...
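A small sketch of why mem_size is now type-dependent; the MemoryBlock values below are hypothetical:

from mace.proto import mace_pb2
from mace.python.tools.memory_optimizer import MemoryBlock

def mem_size(memory_block):
    # mirrors GPUMemoryOptimizer.mem_size above
    if memory_block.mem_type == mace_pb2.GPU_IMAGE:
        return memory_block.block[0] * memory_block.block[1] * 4
    return memory_block.block[0]

gpu_block = MemoryBlock(mace_pb2.GPU_IMAGE, [1792, 224])  # image: x, y texels
cpu_block = MemoryBlock(mace_pb2.CPU_BUFFER, [4])         # buffer: element count
assert mem_size(gpu_block) == 1792 * 224 * 4
assert mem_size(cpu_block) == 4

# Blocks of different mem_type can no longer be merged: the optimizer's
# reuse loop above now skips candidates whose mem_type differs.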
mace/python/tools/model.jinja2
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
     mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
     mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
+    mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
     mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
     mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
...
mace/test/mace_api_mt_test.cc
@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
...
@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...
mace/test/mace_api_test.cc
@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
...
@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...