Xiaomi / Mace

Commit 88120708
Authored on Jul 28, 2018 by liuqi
Add gpu avalibility check and return status to user if gpu call failed.
Parent: d9a58a5e

Showing 47 changed files with 1161 additions and 560 deletions (+1161 -560)
mace/core/file_storage.cc                            +2    -2
mace/core/runtime/opencl/opencl_allocator.cc         +11   -4
mace/core/runtime/opencl/opencl_runtime.cc           +105  -28
mace/core/runtime/opencl/opencl_runtime.h            +21   -6
mace/core/runtime/opencl/opencl_wrapper.cc           +319  -161
mace/core/workspace.cc                               +22   -20
mace/examples/cli/example.cc                         +4    -1
mace/kernels/opencl/activation.cc                    +4    -2
mace/kernels/opencl/addn.cc                          +4    -2
mace/kernels/opencl/batch_norm.cc                    +4    -2
mace/kernels/opencl/bias_add.cc                      +3    -2
mace/kernels/opencl/buffer_to_image.cc               +5    -3
mace/kernels/opencl/channel_shuffle.cc               +5    -3
mace/kernels/opencl/concat.cc                        +19   -10
mace/kernels/opencl/conv_2d_1x1.cc                   +31   -23
mace/kernels/opencl/conv_2d_3x3.cc                   +27   -20
mace/kernels/opencl/conv_2d_general.cc               +31   -24
mace/kernels/opencl/crop.cc                          +16   -8
mace/kernels/opencl/deconv_2d_opencl.cc              +4    -2
mace/kernels/opencl/depth_to_space.cc                +6    -3
mace/kernels/opencl/depthwise_conv.cc                +29   -21
mace/kernels/opencl/eltwise.cc                       +4    -2
mace/kernels/opencl/fully_connected.cc               +8    -6
mace/kernels/opencl/helper.cc                        +38   -27
mace/kernels/opencl/helper.h                         +11   -11
mace/kernels/opencl/image_to_buffer.cc               +6    -4
mace/kernels/opencl/matmul.cc                        +4    -2
mace/kernels/opencl/out_of_range_check_test.cc       +11   -4
mace/kernels/opencl/pad.cc                           +4    -2
mace/kernels/opencl/pooling.cc                       +22   -13
mace/kernels/opencl/reduce_mean_opencl.cc            +8    -4
mace/kernels/opencl/resize_bilinear.cc               +28   -19
mace/kernels/opencl/slice.cc                         +5    -2
mace/kernels/opencl/softmax.cc                       +18   -12
mace/kernels/opencl/space_to_batch.cc                +6    -3
mace/kernels/opencl/winograd_transform.cc            +12   -6
mace/libmace/mace.cc                                 +44   -0
mace/proto/mace.proto                                +9    -2
mace/public/mace_runtime.h                           +76   -51
mace/python/tools/convert_util.py                    +65   -0
mace/python/tools/converter.py                       +7    -0
mace/python/tools/converter_tool/base_converter.py   +1    -0
mace/python/tools/converter_tool/transformer.py      +20   -12
mace/python/tools/memory_optimizer.py                +77   -31
mace/python/tools/model.jinja2                       +1    -0
mace/test/mace_api_mt_test.cc                        +2    -0
mace/test/mace_api_test.cc                           +2    -0
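The theme across these files: replace LOG(FATAL)/MACE_CHECK aborts in the OpenCL path with logged errors plus MaceStatus returns, so an application can detect an unusable GPU and fall back gracefully. A minimal caller-side sketch of the behavior this enables (not code from this commit; mace/examples/cli/example.cc below shows the real call site):

    // Sketch only: `create_engine_status` is the MaceStatus returned by
    // whichever CreateMaceEngineFrom* overload the application already uses.
    MaceStatus create_engine_status = /* create the engine as before */;
    if (create_engine_status != MaceStatus::MACE_SUCCESS) {
      // With this commit, a broken or missing GPU surfaces here instead of
      // aborting the process; fall back to another runtime (e.g. CPU).
    }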
mace/core/file_storage.cc

@@ -37,7 +37,7 @@ int FileStorage::Load() {
   struct stat st;
   if (stat(file_path_.c_str(), &st) == -1) {
     if (errno == ENOENT) {
-      LOG(INFO) << "File " << file_path_
-                << " does not exist";
+      VLOG(1) << "File " << file_path_
+              << " does not exist";
       return 0;
     } else {
mace/core/runtime/opencl/opencl_allocator.cc

@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              offset, nbytes, nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }

@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
       nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }

@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   auto queue = OpenCLRuntime::Global()->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+  }
 }

 bool OpenCLAllocator::OnHost() const { return false; }
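Map and MapImage no longer abort on a failed CL call; they log the error and hand back nullptr. Callers therefore need a null check they did not need before. A hedged sketch (not from this commit) of the expected handling:

    // Assumption for illustration: `allocator` is an OpenCLAllocator and
    // `buffer` was allocated by it. Map() now returns nullptr on CL errors.
    void *mapped = allocator->Map(buffer, offset, nbytes);
    if (mapped == nullptr) {
      return MaceStatus::MACE_OUT_OF_RESOURCES;  // propagate, don't crash
    }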
mace/core/runtime/opencl/opencl_runtime.cc

@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
 OpenCLRuntime::OpenCLRuntime():
     precompiled_binary_storage_(nullptr),
     cache_storage_(nullptr),
-    is_profiling_enabled_(false) {
+    is_opencl_avaliable_(false),
+    is_profiling_enabled_(false),
+    opencl_version_(CL_VER_UNKNOWN),
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
-    LOG(FATAL) << "No OpenCL platforms found";
+    LOG(ERROR) << "No OpenCL platforms found";
+    return;
   }
   cl::Platform default_platform = all_platforms[0];
   std::stringstream ss;

@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
   std::vector<cl::Device> all_devices;
   default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
   if (all_devices.size() == 0) {
-    LOG(FATAL) << "No OpenCL devices found";
+    LOG(ERROR) << "No OpenCL devices found";
+    return;
   }
   bool gpu_detected = false;

@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = ParseDeviceVersion(device_version);
+      if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
+        return;
+      }
       VLOG(1) << "Using device: " << device_name;
       break;
     }
   }
   if (!gpu_detected) {
-    LOG(FATAL) << "No GPU device found";
+    LOG(ERROR) << "No GPU device found";
+    return;
   }

   cl_command_queue_properties properties = 0;

@@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime():
         new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
     }
   }
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   command_queue_ = std::make_shared<cl::CommandQueue>(*context_,
                                                       *device_,
                                                       properties,
                                                       &err);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;

@@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime():
   }

   if (cached_binary_platform_info != platform_info_) {
-    if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      LOG(WARNING) << "There is no precompiled OpenCL binary in"
-                      " all OpenCL binary paths";
-    } else {
+    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
       precompiled_binary_storage_.reset(
           new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
       if (precompiled_binary_storage_->Load() != 0) {

@@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
   } else {
     this->out_of_range_check_ = false;
   }
+
+  is_opencl_avaliable_ = true;
 }

 OpenCLRuntime::~OpenCLRuntime() {

@@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
   device_.reset();
 }

+bool OpenCLRuntime::is_opencl_avaliable() {
+  static const uint64_t kMinWorkGroupSize = 64;
+  return is_opencl_avaliable_
+      && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
+}
+
 cl::Context &OpenCLRuntime::context() { return *context_; }

 cl::Device &OpenCLRuntime::device() { return *device_; }

@@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
   return true;
 }

-void OpenCLRuntime::BuildProgramFromSource(
+bool OpenCLRuntime::BuildProgramFromSource(
     const std::string &program_name,
     const std::string &built_program_key,
     const std::string &build_options_str,

@@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
     LOG(WARNING) << "Build program "
                  << program_name << " from source failed: "
                  << MakeString(ret);
-    return;
+    return false;
   }

   // Keep built program binary

@@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
   cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
                                 sizeof(size_t) * device_list_size,
                                 program_binary_sizes.get(), nullptr);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
   std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
       new std::unique_ptr<unsigned char[]>[device_list_size]);
   for (cl_uint i = 0; i < device_list_size; ++i) {

@@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
   err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
                          sizeof(unsigned char *) * device_list_size,
                          program_binaries.get(), nullptr);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
   std::vector<unsigned char> content(
       reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
       reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +

@@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     VLOG(3) << "Program from source: " << built_program_key;
   }
+  return true;
 }

-void OpenCLRuntime::BuildProgram(const std::string &program_name,
+bool OpenCLRuntime::BuildProgram(const std::string &program_name,
                                  const std::string &built_program_key,
                                  const std::string &build_options,
                                  cl::Program *program) {

@@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     ret = BuildProgramFromPrecompiledBinary(built_program_key,
                                             build_options_str, program);
     if (!ret) {
-      BuildProgramFromSource(program_name, built_program_key,
-                             build_options_str, program);
+      ret = BuildProgramFromSource(program_name, built_program_key,
+                                   build_options_str, program);
     }
   }
+  return ret;
 }

-cl::Kernel OpenCLRuntime::BuildKernel(
+MaceStatus OpenCLRuntime::BuildKernel(
     const std::string &program_name,
     const std::string &kernel_name,
-    const std::set<std::string> &build_options) {
+    const std::set<std::string> &build_options,
+    cl::Kernel *kernel) {
   std::string build_options_str;
   for (auto &option : build_options) {
     build_options_str += " " + option;

@@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key, build_options_str,
-                       &program);
+    bool ret = this->BuildProgram(program_name, built_program_key,
+                                  build_options_str, &program);
+    if (!ret) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
     built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(program, kernel_name.c_str());
+  cl_int err;
+  *kernel = cl::Kernel(program, kernel_name.c_str(), &err);
+  MACE_CL_RET_STATUS(err);
+  return MaceStatus::MACE_SUCCESS;
 }

 void OpenCLRuntime::SaveBuiltCLProgram() {

@@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
 uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

+bool OpenCLRuntime::IsImageSupport() {
+  cl_bool res;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
+  return res == CL_TRUE;
+}
+
+std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
+  size_t max_height, max_width;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  return {max_height, max_width};
+}
+
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

@@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
   } else if (words[1] == "1.0") {
     return OpenCLVersion::CL_VER_1_0;
   } else {
-    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
-    return OpenCLVersion::CL_VER_1_0;
+    LOG(ERROR) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_UNKNOWN;
   }
 }
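The constructor now bails out early on any setup failure, leaving is_opencl_avaliable_ false, and the new is_opencl_avaliable() accessor (the spelling is the commit's own) additionally requires a max work-group size of at least 64. A hedged usage sketch:

    // Sketch: probe GPU usability before dispatching work to it.
    if (!OpenCLRuntime::Global()->is_opencl_avaliable()) {
      LOG(WARNING) << "The device does not support OpenCL";
      // choose a CPU fallback here instead of running GPU kernels
    }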
mace/core/runtime/opencl/opencl_runtime.h

@@ -42,13 +42,23 @@ enum OpenCLVersion {
   CL_VER_1_1,
   CL_VER_1_2,
   CL_VER_2_0,
+  CL_VER_UNKNOWN,
 };

 const std::string OpenCLErrorToString(cl_int error);

-#define MACE_CHECK_CL_SUCCESS(error) \
-  MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)
+#define MACE_CL_RET_ERROR(error)                           \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return error;                                          \
+  }
+
+#define MACE_CL_RET_STATUS(error)                          \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return MaceStatus::MACE_OUT_OF_RESOURCES;              \
+  }

 class OpenCLProfilingTimer : public Timer {
  public:

@@ -81,19 +91,23 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  bool is_opencl_avaliable();

   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
+  bool IsImageSupport();
+  std::vector<uint64_t> GetMaxImage2DSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
   bool IsOutOfRangeCheckEnabled() const;
   bool is_profiling_enabled() const;

-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
+  MaceStatus BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options,
+                         cl::Kernel *kernel);

   void SaveBuiltCLProgram();

@@ -103,7 +117,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime &) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

-  void BuildProgram(const std::string &program_file_name,
+  bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);

@@ -115,7 +129,7 @@ class OpenCLRuntime {
                     const std::string &built_program_key,
                     const std::string &build_options_str,
                     cl::Program *program);
-  void BuildProgramFromSource(
+  bool BuildProgramFromSource(
       const std::string &program_name,
       const std::string &built_program_key,
       const std::string &build_options_str,

@@ -125,6 +139,7 @@ class OpenCLRuntime {
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
   std::unique_ptr<KVStorage> cache_storage_;
+  bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
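The old MACE_CHECK_CL_SUCCESS macro check-failed, aborting the process; the two replacements log and return, chosen by the enclosing function's return type: MACE_CL_RET_ERROR inside functions returning cl_int, MACE_CL_RET_STATUS inside functions returning MaceStatus. A short usage sketch of the latter (hypothetical function, real macro):

    // Sketch: any OpenCL error becomes MACE_OUT_OF_RESOURCES for the caller.
    MaceStatus EnqueueOnce(cl::CommandQueue *queue, const cl::Kernel &kernel) {
      cl_int error = queue->enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(1), cl::NullRange);
      MACE_CL_RET_STATUS(error);  // logs and early-returns on failure
      return MaceStatus::MACE_SUCCESS;
    }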
mace/core/runtime/opencl/opencl_wrapper.cc

(This diff is collapsed in the page view and not shown; per the summary above it carries +319 -161.)
mace/core/workspace.cc

@@ -204,28 +204,30 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::GPU) {
-      // TODO(liuqi): refactor based on PB
-      if (mem_block.mem_id() >= 20000) {
-        std::unique_ptr<BufferBase> image_buf(
-            new Image());
-        MACE_RETURN_IF_ERROR(image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(image_buf));
-      }
-    } else {
-      if (mem_block.mem_id() < 20000) {
-        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type)));
-        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(tensor_buf));
-      }
+    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)
+          + MACE_EXTRA_BUFFER_PAD_SIZE));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+      std::unique_ptr<BufferBase> image_buf(
+          new Image());
+      MACE_RETURN_IF_ERROR(image_buf->Allocate(
+          {mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
     }
   }
   VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
     // TODO(liuqi): refactor based on PB
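Note the semantic shift in this hunk: the old code inferred the block kind from a magic mem_id threshold (ids >= 20000 meant a GPU image), while the new code reads an explicit mem_type carried by the model (the companion mace.proto change in this commit, +9 -2). Condensed, the new dispatch is:

    // Condensed restatement of the hunk above, for readability:
    //   CPU_BUFFER -> Buffer on the CPU allocator, padded by
    //                 MACE_EXTRA_BUFFER_PAD_SIZE
    //   GPU_IMAGE  -> 2D Image of {x, y}
    //   GPU_BUFFER -> Buffer on the GPU allocator, unpadded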
mace/examples/cli/example.cc

@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
 #endif
   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments" << std::endl;
+    std::cerr << "Create engine error, please check the arguments first, "
+              << "if correct, the device may not run the model, "
+              << "please fall back to other strategy."
+              << std::endl;
     exit(1);
   }
mace/kernels/opencl/activation.cc

@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
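activation.cc is the template for the remaining kernel files in this commit: every `kernel_ = runtime->BuildKernel(...)` assignment becomes a status-checked call with an output parameter, and every bare TuningOrRun2DKernel/TuningOrRun3DKernel call is wrapped the same way. A hedged sketch of the resulting functor shape (locals such as kernel_name, built_options, gws, lws and tuning_key are elided; MACE_RETURN_IF_ERROR is MACE's existing early-return helper):

    // Common shape after this commit; details vary per kernel file.
    MaceStatus SomeOpenCLFunctor::operator()(const Tensor *input,
                                             Tensor *output,
                                             StatsFuture *future) {
      auto runtime = OpenCLRuntime::Global();
      if (kernel_.get() == nullptr) {
        // Build (or fetch the cached) program/kernel; propagate failures.
        MACE_RETURN_IF_ERROR(runtime->BuildKernel(
            "some_program", kernel_name, built_options, &kernel_));
      }
      // Enqueue with auto-tuning; propagate enqueue/run failures too.
      MACE_RETURN_IF_ERROR(
          TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future));
      return MaceStatus::MACE_SUCCESS;
    }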
mace/kernels/opencl/addn.cc

@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
              output_tensor->dim(2), output_tensor->dim(3));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
mace/kernels/opencl/batch_norm.cc

@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
mace/kernels/opencl/bias_add.cc

@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
mace/kernels/opencl/buffer_to_image.cc

@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options, &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {

@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
mace/kernels/opencl/channel_shuffle.cc

@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("channel_shuffle", kernel_name, built_options,
+                             &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
mace/kernels/opencl/concat.cc

@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
     if (input0->dim(3) % 4 == 0) {
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));

@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);

@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }

@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
           cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
           cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (runtime->IsOutOfRangeCheckEnabled()) {
       (*kernel_error)->Map(nullptr);
       char *kerror_code = (*kernel_error)->mutable_data<char>();
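Why the new kwg_size == 0 branch in LocalWS: GetKernelMaxWorkGroupSize() can now return 0 on a failed query instead of aborting, and the heuristics divide by kwg_size-derived values. The same guard recurs in the LocalWS variants of conv_2d_1x1.cc, conv_2d_3x3.cc, conv_2d_general.cc, crop.cc and depthwise_conv.cc below. Distilled:

    // Distilled from the hunk above: degrade to a trivial 1x1x1 local size
    // when the max work-group size could not be queried, avoiding the
    // divisions by kwg_size-derived quantities in the tuned path.
    std::vector<uint32_t> lws(4, 0);
    if (kwg_size == 0) {
      lws[0] = lws[1] = lws[2] = 1;
    } else {
      // cache-size-based heuristics, unchanged apart from indentation
    }
    return lws;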
mace/kernels/opencl/conv_2d_1x1.cc

@@ -27,7 +27,11 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);

@@ -45,12 +49,14 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(
-      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
-      gws[2]);
-  if (lws[2] == 0) {
-    lws[2] = std::min<uint32_t>(gws[2], base);
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::min<uint32_t>(
+        (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
+        gws[2]);
+    if (lws[2] == 0) {
+      lws[2] = std::min<uint32_t>(gws[2], base);
+    }
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));

@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
mace/kernels/opencl/conv_2d_3x3.cc

@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = std::max<uint32_t>(
-      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
-  const uint32_t base =
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = std::max<uint32_t>(
+        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+    const uint32_t base =

@@ -45,6 +49,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));

@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
mace/kernels/opencl/conv_2d_general.cc

@@ -30,7 +30,11 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  const uint32_t base =
-      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+    const uint32_t base =
+        std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);

@@ -54,6 +58,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
   }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));

@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
              output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
mace/kernels/opencl/crop.cc

@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                                1);
+  }
   return lws;
 }

@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
mace/kernels/opencl/deconv_2d_opencl.cc

@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));

@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
mace/kernels/opencl/depth_to_space.cc

@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   }
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws,
+                                           future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
mace/kernels/opencl/depthwise_conv.cc
...
@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = cache_size / kBaseGPUMemCacheSize;
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     if (lws[1] >= base) {
...
@@ -38,7 +42,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
                                      kwg_size / lws[1]);
       }
     }
     lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]),
                                 1);
     const uint32_t lws_size = lws[0] * lws[1];
     lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
                                 gws[2]);
...
@@ -47,6 +52,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
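The same kwg_size == 0 guard recurs in several LocalWS helpers throughout this commit. Since kernel builds can now fail without aborting, the max work-group size queried from the kernel can plausibly be 0, and the old expressions such as kwg_size / lws[1] would then divide by zero. A self-contained sketch of the idea, with a deliberately simplified else-branch heuristic of our own (the real per-kernel cache math is in the hunks above):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hedged sketch: the {1, 1, 1} fallback mirrors the diff; the rest is
    // illustrative, not MACE's actual heuristic.
    std::vector<uint32_t> GuardedLocalWS(const uint32_t *gws,
                                         uint32_t kwg_size) {
      std::vector<uint32_t> lws(4, 0);
      if (kwg_size == 0) {
        // No max work-group size is known (e.g. the kernel never built);
        // fall back to a degenerate but always-valid work-group.
        lws[0] = lws[1] = lws[2] = 1;
      } else {
        // Clamp each divisor to at least 1 so no division can be by zero.
        lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
        lws[0] = std::max<uint32_t>(kwg_size / lws[1], 1);
        lws[2] = std::max<uint32_t>(kwg_size / (lws[0] * lws[1]), 1);
      }
      return lws;
    }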
@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_conv2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
       Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
mace/kernels/opencl/eltwise.cc
...
@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/fully_connected.cc
...
@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name,
-                             built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("fully_connected", kernel_name,
+                             built_options, kernel));
     if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
...
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (future != nullptr) {
       future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("fully_connected", kernel_name,
+                             built_options, kernel));
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
+                                           gws->data(), *lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
mace/kernels/opencl/helper.cc
...
@@ -245,6 +245,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                        const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
     uint64_t cache_size =
         OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...
@@ -254,10 +257,11 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
     const uint32_t lws_size = lws[1] * lws[2];
     lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
...
@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 3);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 3; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
...
@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
...
@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
               cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
...
@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
...
@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
...
@@ -388,9 +394,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
       }
     };
   }
+  return MaceStatus::MACE_SUCCESS;
 }

-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
...
@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 2);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 2; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
...
@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1]),
               cl::NDRange(params[0], params[1]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
...
@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
...
@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
...
@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       }
     };
   }
+  return MaceStatus::MACE_SUCCESS;
 }

 }  // namespace kernels
...
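These hunks swap the hard-aborting MACE_CHECK_CL_SUCCESS for two new macros whose definitions are outside this commit view: MACE_CL_RET_ERROR, used inside the tuning lambdas that must hand a raw cl_int back to the tuner, and MACE_CL_RET_STATUS, used where the enclosing function now returns MaceStatus. A stubbed, self-contained guess at their shape; the _SKETCH suffix flags that these are assumptions, not the real definitions:

    using cl_int_sketch = int;               // stands in for OpenCL's cl_int
    constexpr cl_int_sketch kClSuccess = 0;  // stands in for CL_SUCCESS
    enum class MaceStatusSketch { SUCCESS, OUT_OF_RESOURCES };

    // Inside the tuning lambdas: keep propagating the raw OpenCL error code.
    #define MACE_CL_RET_ERROR_SKETCH(err) \
      do { if ((err) != kClSuccess) return (err); } while (0)

    // At the helper's boundary: translate the OpenCL error into a MaceStatus.
    #define MACE_CL_RET_STATUS_SKETCH(err)                                   \
      do {                                                                   \
        if ((err) != kClSuccess) return MaceStatusSketch::OUT_OF_RESOURCES;  \
      } while (0)

    MaceStatusSketch RunSketch(cl_int_sketch enqueue_result) {
      MACE_CL_RET_STATUS_SKETCH(enqueue_result);  // early-return on failure
      return MaceStatusSketch::SUCCESS;
    }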
mace/kernels/opencl/helper.h
...
@@ -65,13 +65,13 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpstreamCLDt(const DataType dt);

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future);

-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
...
mace/kernels/opencl/image_to_buffer.cc
...
@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       kernel_error_->UnMap();
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options, &b2f_kernel));
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/matmul.cc
...
@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/out_of_range_check_test.cc
...
@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
     kernel_error->UnMap();
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  cl_int error = runtime->BuildKernel("buffer_to_image",
+                                      obfuscated_kernel_name,
+                                      built_options, &b2f_kernel);
+  if (error != CL_SUCCESS) {
+    return false;
+  }
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, kwg_size / 16};
   cl::Event event;
-  cl_int error;
   if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
...
@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    return false;
+  }
   runtime->command_queue().finish();
   bool is_out_of_range = false;
...
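Note the contract here: BufferToImageOpImpl returns bool rather than MaceStatus, so these hunks convert every OpenCL failure into return false instead of adopting the new status type. That lets the out-of-range check test record a clean failed expectation, where the old MACE_CHECK_CL_SUCCESS presumably aborted the whole test binary on any CL error.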
mace/kernels/opencl/pad.cc
...
@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                   output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/pooling.cc
...
@@ -25,7 +25,11 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     lws[2] =
...
@@ -37,6 +41,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::string tuning_key =
       Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws.data(), lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/reduce_mean_opencl.cc
...
@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
     }
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", kernel_name,
+                                              built_options, &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
...
@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
mace/kernels/opencl/resize_bilinear.cc
...
@@ -25,7 +25,11 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     if (lws[1] >= base) {
...
@@ -44,6 +48,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("resize_bilinear", kernel_name,
+                             built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/slice.cc
...
@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
           cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
           cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
     }
-    MACE_CHECK_CL_SUCCESS(error);
+    MACE_CL_RET_STATUS(error);
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_error_->Map(nullptr);
       char *kerror_code = kernel_error_->mutable_data<char>();
...
mace/kernels/opencl/softmax.cc
...
@@ -25,9 +25,13 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t cache_size =
+        OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     if (gws[0] < base) {
       lws[0] = gws[0];
...
@@ -35,9 +39,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
       lws[0] = gws[0] / base;
     }
     lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(gws[2],
-                                                 kwg_size / (lws[0] * lws[1])),
-                              1);
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(
+        gws[2], kwg_size / (lws[0] * lws[1])), 1);
+  }
   return lws;
 }
...
@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
   std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/space_to_batch.cc
...
@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
              batch_tensor->dim(2), batch_tensor->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/kernels/opencl/winograd_transform.cc
...
@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
              output_tensor->dim(0),
              output_tensor->dim(1),
              output_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3), input_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
mace/libmace/mace.cc
...
@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
   MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
              strerror(errno));
 }

+#ifdef MACE_ENABLE_OPENCL
+MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+  // Check OpenCL avaliable
+  auto runtime = OpenCLRuntime::Global();
+  if (!runtime->is_opencl_avaliable()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  // Check whether model max OpenCL image sizes exceed OpenCL limitation.
+  if (net_def == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  if (!runtime->IsImageSupport()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  auto opencl_max_image_size = runtime->GetMaxImage2DSize();
+  if (opencl_max_image_size.empty()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  const std::vector<int64_t> net_max_image_size =
+      ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
+          *net_def, "opencl_max_image_size", {0, 0});
+  if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
+      || static_cast<uint64_t>(net_max_image_size[1])
+          > opencl_max_image_size[1]) {
+    LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
+              << " vs " << MakeString(net_max_image_size);
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+#endif
+
 }  // namespace

 // Mace Tensor
...
@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
     const std::vector<std::string> &output_nodes,
     const unsigned char *model_data) {
   LOG(INFO) << "Initializing MaceEngine";
+  // Check avalibility
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+  }
+#endif
   // Get input and output information.
   for (auto &input_info : net_def->input_info()) {
     input_info_map_[input_info.name()] = input_info;
...
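With these hunks a GPU engine Init can now fail gracefully (MACE_OUT_OF_RESOURCES or MACE_INVALID_ARGS) instead of aborting, so applications should check the returned status. A self-contained sketch of application-side handling; the factory function and the fall-back-to-CPU policy are illustrative assumptions, not part of this commit:

    #include <cstdio>

    // MaceStatus values mirror mace/public headers; CreateEngineSketch
    // stands in for whatever engine factory the application already uses.
    enum class MaceStatus {
      MACE_SUCCESS, MACE_INVALID_ARGS, MACE_OUT_OF_RESOURCES
    };
    enum class DeviceType { CPU, GPU };

    MaceStatus CreateEngineSketch(DeviceType device) {
      // Pretend the GPU path fails the new availability check.
      return device == DeviceType::GPU ? MaceStatus::MACE_OUT_OF_RESOURCES
                                       : MaceStatus::MACE_SUCCESS;
    }

    int main() {
      MaceStatus status = CreateEngineSketch(DeviceType::GPU);
      if (status == MaceStatus::MACE_OUT_OF_RESOURCES) {
        // OpenCL unavailable, images unsupported, or the model's image
        // sizes exceed the device limits: fall back to the CPU runtime.
        status = CreateEngineSketch(DeviceType::CPU);
      }
      std::printf("final status: %d\n", static_cast<int>(status));
      return 0;
    }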
mace/proto/mace.proto
...
@@ -20,6 +20,12 @@ enum DataType {
   DT_INT32 = 4;
 }

+enum MemoryType {
+  CPU_BUFFER = 0;
+  GPU_BUFFER = 1;
+  GPU_IMAGE = 2;
+}
+
 message ConstTensor {
   repeated int64 dims = 1;
   optional DataType data_type = 2 [default = DT_FLOAT];
...
@@ -73,8 +79,9 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional uint32 x = 2;
-  optional uint32 y = 3;
+  optional MemoryType mem_type = 2;
+  optional uint32 x = 3;
+  optional uint32 y = 4;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
...
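Worth noting: mem_type is inserted as field 2 and x/y shift to 3 and 4 rather than being appended. Renumbering existing proto fields changes the wire format, so MemoryBlock data serialized before this commit would decode incorrectly afterwards; presumably that is acceptable here because model files are regenerated by the converter rather than kept across versions.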
mace/public/mace_runtime.h
...
@@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };

-// Set Key-Value store factory. (Call Once)
-// Now KVStorage is used to store the built OpenCL binaries to file,
-// which could speed up the GPU initialization and first run.
-// If do not call this API, the initialization maybe slow for GPU.
+/// \brief Set internal storage factory to store internal data. (Call once)
+///
+/// Now the path is used to store the built OpenCL binaries to file,
+/// which could speed up the GPU initialization and first run.
+/// If do not call this API, the initialization maybe slow for GPU.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);

-// Just call once. (Not thread-safe)
-// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
-// if you use gpu of specific soc.
-// Using OpenCL binary will speed up the initialization.
-// OpenCL binary is corresponding to the OpenCL Driver version,
-// you should update the binary when OpenCL Driver changed.
+/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length)
+///
+/// Just call once. (Not thread-safe)
+/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length)
+/// OpenCL binary is corresponding to the OpenCL Driver version,
+/// you should update the binary when OpenCL Driver changed.
+///
+/// \param paths MACE will use first file found in all paths
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);

-// Just call once. (Not thread-safe)
-// Set the path of Generated OpenCL parameter file
-// if you use gpu for specific soc.
-// The parameters is the local work group size tuned for specific SOC, which
-// may be faster than the general parameters.
+/// \brief Set the path of Generated OpenCL parameter file
+///
+/// Just call once. (Not thread-safe)
+/// If you use gpu for specific soc, The parameters is the local work group
+/// size tuned for specific SOC, which may be faster than the
+/// general parameters.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLParameterPath(const std::string &path);

-// Set GPU hints, currently only supports Adreno GPU.
-//
-// Caution: this function may hurt performance if improper parameters provided.
+/// \brief Set GPU hints, currently only supports Adreno GPU.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+///
+/// \param perf_hint performance hint
+/// \param priority_hint priority hint
+/// \return
 __attribute__((visibility("default")))
 void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);

-// Set OpenMP threads number and affinity policy.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
-// the function will set the threads number equaling to the number of
-// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-// (AFFINITY_NONE) cores according to the policy. The threads number will
-// also be truncated to the corresponding cores number when num_threads_hint
-// is larger than it.
-//
-// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
-// suggested to use AFFINITY_NONE to use all cores.
+/// \brief Set OpenMP threads number and affinity policy.
+///
+/// Caution: this function may hurt performance if improper parameters provided.
+///
+/// When num_threads_hint is zero or negative,
+/// the function will set the threads number equaling to the number of
+/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+/// (AFFINITY_NONE) cores according to the policy. The threads number will
+/// also be truncated to the corresponding cores number when num_threads_hint
+/// is larger than it.
+/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
+/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
+///
+/// \param num_threads_hint it is only a hint.
+/// \param policy one of CPUAffinityPolicy
+/// \param status MACE_SUCCESS for successful, or it can't reliabley
+/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+/// suggested to use AFFINITY_NONE to use all cores.
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy);

-// Set OpenMP threads number and processor affinity.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// This function may not work well on some chips (e.g. MTK). Setting thread
-// affinity to offline cores may run very slow or unexpectedly. In such cases,
-// please use SetOpenMPThreadPolicy with default policy instead.
+/// \brief Set OpenMP threads number and processor affinity.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+/// This function may not work well on some chips (e.g. MTK). Setting thread
+/// affinity to offline cores may run very slow or unexpectedly.
+/// In such cases, please use SetOpenMPThreadPolicy with default policy
+/// instead.
+///
+/// \param num_threads
+/// \param cpu_ids
+/// \param status
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
                                    const std::vector<int> &cpu_ids);

-// Get ARM big.LITTLE configuration.
-//
-// This function will detect the max frequencies of all CPU cores, and assume
-// the cores with largest max frequencies as big cores, and all the remaining
-// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-// little_core_ids will both be filled with all cpu core ids.
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect the frequency of big-LITTLE cores (e.g. MTK).
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, and assume
+/// the cores with largest max frequencies as big cores, and all the remaining
+/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
+/// little_core_ids will both be filled with all cpu core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return If successful, it returns MACE_SUCCESS and error if it can't
+/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
...
mace/python/tools/convert_util.py
...
@@ -12,7 +12,72 @@
...
@@ -12,7 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import enum


def mace_check(condition, msg):
    if not condition:
        raise Exception(msg)


def roundup_div4(value):
    return int((value + 3) / 4)


class OpenCLBufferType(enum.Enum):
    CONV2D_FILTER = 0
    IN_OUT_CHANNEL = 1
    ARGUMENT = 2
    IN_OUT_HEIGHT = 3
    IN_OUT_WIDTH = 4
    WINOGRAD_FILTER = 5
    DW_CONV2D_FILTER = 6
    WEIGHT_HEIGHT = 7
    WEIGHT_WIDTH = 8


def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
    # keep consistent with mace/kernels/opencl/helper.cc
    image_shape = [0, 0]
    if buffer_type == OpenCLBufferType.CONV2D_FILTER:
        mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
        image_shape[0] = shape[1]
        image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
    elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
        mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
        image_shape[0] = roundup_div4(shape[3]) * shape[2]
        image_shape[1] = shape[0] * shape[1]
    elif buffer_type == OpenCLBufferType.ARGUMENT:
        mace_check(len(shape) == 1,
                   "Argument buffer should be 1D not " + str(shape))
        image_shape[0] = roundup_div4(shape[0])
        image_shape[1] = 1
    elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
        image_shape[0] = shape[2] * shape[3]
        image_shape[1] = shape[0] * roundup_div4(shape[1])
    elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
        image_shape[0] = roundup_div4(shape[2]) * shape[3]
        image_shape[1] = shape[0] * shape[1]
    elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
        image_shape[0] = roundup_div4(shape[1])
        image_shape[1] = (shape[0] * (winograd_blk_size + 2)
                          * (winograd_blk_size + 2))
    elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
        mace_check(len(shape) == 4,
                   "Depthwise Conv2D filter buffer should be 4D")
        image_shape[0] = shape[0] * shape[2] * shape[3]
        image_shape[1] = roundup_div4(shape[1])
    elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
        mace_check(len(shape) == 4, "Weight buffer should be 4D")
        image_shape[0] = shape[1] * shape[2] * shape[3]
        image_shape[1] = roundup_div4(shape[0])
    elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
        mace_check(len(shape) == 4, "Weight buffer should be 4D")
        image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
        image_shape[1] = shape[0]
    else:
        mace_check(False,
                   "OpenCL image does not support type " + str(buffer_type))
    return image_shape
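For concreteness, a minimal usage sketch of calculate_image_shape (the NHWC shape below is an arbitrary example, not taken from the commit). An OpenCL image for an input/output tensor packs 4 channels per pixel, so a 1x224x224x32 tensor maps to an image ceil(32/4) * 224 wide and 1 * 224 tall:

shape = [1, 224, 224, 32]  # NHWC
image_shape = calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, shape)
assert image_shape == [1792, 224]  # [roundup_div4(32) * 224, 1 * 224]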
mace/python/tools/converter.py
@@ -171,6 +171,13 @@ def main(unused_args):
        output_graph_def.op.extend(cpu_graph_def.op)
        output_graph_def.mem_arena.mem_block.extend(
            cpu_graph_def.mem_arena.mem_block)
        output_graph_arg_names = set()
        for arg in output_graph_def.arg:
            output_graph_arg_names.add(arg.name)

        for arg in cpu_graph_def.arg:
            if arg.name not in output_graph_arg_names:
                output_graph_def.arg.extend([arg])

        print "Merge done"
    else:
        option.device = device_type_map[FLAGS.runtime]
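The arg merge above is effectively a name-keyed set union. A hedged, proto-free sketch of the same rule (plain dicts stand in for NetDef arguments; the names are illustrative):

gpu_args = [{'name': 'T'}, {'name': 'opencl_max_image_size'}]
cpu_args = [{'name': 'T'}, {'name': 'mode'}]
seen = set(a['name'] for a in gpu_args)
merged = gpu_args + [a for a in cpu_args if a['name'] not in seen]
assert [a['name'] for a in merged] == ['T', 'opencl_max_image_size', 'mode']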
...
mace/python/tools/converter_tool/base_converter.py
@@ -163,6 +163,7 @@ class MaceKeyword(object):
    mace_op_data_type_str = 'T'
    mace_offset_str = 'offset'
    mace_from_caffe_str = 'from_caffe'
    mace_opencl_max_image_size = "opencl_max_image_size"


class TransformerRule(Enum):
...
mace/python/tools/converter_tool/transformer.py
@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import mace_check
from mace.python.tools.convert_util import OpenCLBufferType

OPENCL_IMAGE_MAX_SIZE = 16384


class Transformer(base_converter.ConverterInterface):
...
@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
        self._producer = {}
        self._target_data_format = DataFormat.NHWC
        self._input_output_added = False
        self._opencl_max_image_size = [0, 0]

        if self._option.device == DeviceType.CPU.value:
            self._target_data_format = DataFormat.NCHW
...
@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
        arg.name = MaceKeyword.mace_mode
        arg.i = 0

        tensor_shape = list(self._consts[input_name].dims)
        if input_type == OpenCLBufferType.WINOGRAD_FILTER:
            blk_sqr = op.output_shape[0].dims[0]
            wino_blk = int(np.sqrt(blk_sqr)) - 2
            wino_arg = op_def.arg.add()
            wino_arg.name = MaceKeyword.mace_wino_block_size
            wino_arg.i = wino_blk
            img_shape = calculate_image_shape(input_type, tensor_shape,
                                              wino_blk)
        else:
            img_shape = calculate_image_shape(input_type, tensor_shape)

        op.input[input_idx] = output_name
        # update OpenCL max image size
        self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0],
                                             img_shape[0])
        self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
                                             img_shape[1])

    def transform_buffer_image(self):
        if self._option.device != DeviceType.GPU.value:
            return False
...
@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
                    MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
                self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)

        # Add OpenCL max image size
        arg = net.arg.add()
        arg.name = MaceKeyword.mace_opencl_max_image_size
        arg.ints.extend(self._opencl_max_image_size)

        for input_node in self._option.input_nodes.values():
            new_input_name = MaceKeyword.mace_input_node_name \
                + '_' + input_node.name
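As a quick numeric check of the Winograd bookkeeping above (the value is illustrative): the first output dim of a transformed tensor is (block_size + 2)^2, so the block size is recovered by the inverse:

import numpy as np

blk_sqr = 36  # (wino_blk + 2) ** 2 for a 4x4 block
wino_blk = int(np.sqrt(blk_sqr)) - 2
assert wino_blk == 4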
...
mace/python/tools/memory_optimizer.py
@@ -16,6 +16,24 @@ import sys
import operator
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import OpenCLBufferType


class MemoryBlock(object):
    def __init__(self, mem_type, block):
        self._mem_type = mem_type
        self._block = block

    @property
    def mem_type(self):
        return self._mem_type

    @property
    def block(self):
        return self._block


class MemoryOptimizer(object):
    def __init__(self, net_def):
...
@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
        self.op_mem = {}  # op_name->mem_id
        self.mem_block = {}  # mem_id->MemoryBlock holding [size] or [x, y]
        self.total_mem_count = 0
        self.input_ref_counter = {}
        self.mem_ref_counter = {}
...
@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
        return True

    def get_op_mem_block(self, op_type, output_shape):
        return MemoryBlock(mace_pb2.CPU_BUFFER,
                           [reduce(operator.mul, output_shape, 1)])

    def mem_size(self, memory_block):
        return memory_block.block[0]

    def sub_mem_block(self, mem_block1, mem_block2):
        return self.mem_size(mem_block1) - self.mem_size(mem_block2)

    def resize_mem_block(self, old_mem_block, op_mem_block):
        return MemoryBlock(
            old_mem_block.mem_type,
            [max(old_mem_block.block[0], op_mem_block.block[0])])

    def add_net_mem_blocks(self):
        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.mem_type = self.mem_block[mem].mem_type
            block.x = self.mem_block[mem].block[0]
            block.y = 1

    def get_total_origin_mem_size(self):
...
@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
    def get_total_optimized_mem_size(self):
        optimized_mem_size = 0
        for mem in self.mem_block:
            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
            optimized_mem_size += self.mem_size(self.mem_block[mem])
        return optimized_mem_size
...
@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
        best_mem_waste_size = sys.maxint
        for mid in self.idle_mem:
            old_mem_block = self.mem_block[mid]
            if old_mem_block.mem_type != op_mem_block.mem_type:
                continue
            new_mem_block = self.resize_mem_block(
                old_mem_block, op_mem_block)
            add_mem_size = self.sub_mem_block(new_mem_block,
...
@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
        for arg in op.arg:
            if arg.name == 'mode' and arg.i == 0:
                return False
        return op.type != 'ImageToBuffer'

    def get_op_mem_block(self, op_type, output_shape):
        if op_type == 'WinogradTransform' or op_type == 'MatMul':
            buffer_shape = list(output_shape) + [1]
            mem_block = MemoryBlock(
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
                                      buffer_shape))
        elif op_type == 'Shape':
            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
                                    [output_shape[0], 1])
        else:
            if len(output_shape) == 2:  # only support fc/softmax
                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
            elif len(output_shape) == 4:
                buffer_shape = output_shape
            else:
                raise Exception('output shape dim size is not 2 or 4.')
            mem_block = MemoryBlock(
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
                                      buffer_shape))
        return mem_block

    def mem_size(self, memory_block):
        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
            return memory_block.block[0] * memory_block.block[1] * 4
        else:
            return memory_block.block[0]

    def resize_mem_block(self, old_mem_block, op_mem_block):
        resize_mem_block = MemoryBlock(
            old_mem_block.mem_type,
            [
                max(old_mem_block.block[0], op_mem_block.block[0]),
                max(old_mem_block.block[1], op_mem_block.block[1])
            ])
        return resize_mem_block

    def add_net_mem_blocks(self):
        max_image_size_x = 0
        max_image_size_y = 0
        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.mem_type = self.mem_block[mem].mem_type
            block.x = self.mem_block[mem].block[0]
            block.y = self.mem_block[mem].block[1]
            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
                max_image_size_x = max(max_image_size_x, block.x)
                max_image_size_y = max(max_image_size_y, block.y)

        # Update OpenCL max image size
        net_ocl_max_img_size_arg = None
        for arg in self.net_def.arg:
            if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
                net_ocl_max_img_size_arg = arg
                max_image_size_x = max(arg.ints[0], max_image_size_x)
                max_image_size_y = max(arg.ints[1], max_image_size_y)
                break
        if net_ocl_max_img_size_arg is None:
            net_ocl_max_img_size_arg = self.net_def.arg.add()
            net_ocl_max_img_size_arg.name = \
                cvt.MaceKeyword.mace_opencl_max_image_size
        net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
                                            max_image_size_y]

    def mem_id_base(self):
        return 20000
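A small sketch of the GPU block-reuse rule above, using MemoryBlock directly (GPU_IMAGE stands in for the real mace_pb2.GPU_IMAGE value; the shapes are illustrative): resizing keeps the element-wise maximum of the two image shapes, and a GPU image's size is counted as width * height * 4 channels:

GPU_IMAGE = 2  # placeholder; use mace_pb2.GPU_IMAGE in real code
old_blk = MemoryBlock(GPU_IMAGE, [1792, 224])
op_blk = MemoryBlock(GPU_IMAGE, [896, 448])
merged = MemoryBlock(old_blk.mem_type,
                     [max(old_blk.block[0], op_blk.block[0]),
                      max(old_blk.block[1], op_blk.block[1])])
assert merged.block == [1792, 448]
assert merged.block[0] * merged.block[1] * 4 == 3211264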
...
mace/python/tools/model.jinja2
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
    mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
    mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
    mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
    mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
    mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
...
mace/test/mace_api_mt_test.cc
@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
  for (size_t i = 0; i < input_size; ++i) {
    MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
    mem_blk_ptr->set_mem_id(mem_id);
    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
    mem_blk_ptr->set_x(in_mem_block_x);
    mem_blk_ptr->set_y(in_mem_block_y);
    res[input_names[i]] = mem_id;
...
@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
  for (size_t i = 0; i < output_size; ++i) {
    MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
    mem_blk_ptr->set_mem_id(mem_id);
    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
    mem_blk_ptr->set_x(out_mem_block_x);
    mem_blk_ptr->set_y(out_mem_block_y);
    res[output_names[i]] = mem_id;
...
mace/test/mace_api_test.cc
@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
  for (size_t i = 0; i < input_size; ++i) {
    MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
    mem_blk_ptr->set_mem_id(mem_id);
    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
    mem_blk_ptr->set_x(in_mem_block_x);
    mem_blk_ptr->set_y(in_mem_block_y);
    res[input_names[i]] = mem_id;
...
@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
  for (size_t i = 0; i < output_size; ++i) {
    MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
    mem_blk_ptr->set_mem_id(mem_id);
    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
    mem_blk_ptr->set_x(out_mem_block_x);
    mem_blk_ptr->set_y(out_mem_block_y);
    res[output_names[i]] = mem_id;
...