Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
04b8524e
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
04b8524e
编写于
12月 27, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Temporary solution for cpu/gpu runtime fallback. Fix several issues
上级
aea5e30a
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
137 additions
and
23 deletions
+137
-23
mace/core/net.cc
mace/core/net.cc
+93
-3
mace/core/net.h
mace/core/net.h
+2
-0
mace/core/workspace.cc
mace/core/workspace.cc
+34
-10
mace/libmace/mace.cc
mace/libmace/mace.cc
+4
-4
mace/ops/channel_shuffle.cc
mace/ops/channel_shuffle.cc
+4
-6
未找到文件。
mace/core/net.cc
浏览文件 @
04b8524e
...
...
@@ -14,6 +14,7 @@
#include <algorithm>
#include <limits>
#include <set>
#include <unordered_set>
#include <utility>
...
...
@@ -63,18 +64,85 @@ bool TransformRequiredOp(const std::string &op_type) {
}
#endif // MACE_ENABLE_OPENCL
// TODO(lichao): Move to runtime driver class after universality done.
// Fallback to GPU buffer when kernels are implemented.
//
// Computes the set of device types on which `op` can run. Starts from the
// registry's advertised devices for the op type, then removes GPU for ops
// whose output/input shapes are incompatible with the GPU image kernels
// (which generally require 4-rank NHWC tensors, and for some ops a
// channel count divisible by 4).
//
// @param op_registry        Registry queried for the op type's devices.
// @param op                 The operator definition under inspection.
// @param tensor_shape_info  Map from tensor name to its shape, used to
//                           inspect the input shapes of Concat.
// @param available_devices  Output set; devices are inserted, then GPU may
//                           be erased. Never null.
void FindAvailableDevicesForOp(
    const OpRegistryBase &op_registry,
    const OperatorDef &op,
    const std::unordered_map<std::string, std::vector<index_t>>
        &tensor_shape_info,
    std::set<DeviceType> *available_devices) {
  auto devices = op_registry.AvailableDevices(op.type());
  available_devices->insert(devices.begin(), devices.end());
  // type() returns a reference into the proto; no need to copy it.
  const std::string &op_type = op.type();
  // For those whose shape is not 4-rank but can run on GPU.
  if (op_type == "BufferTransform"
      || op_type == "LSTMCell"
      || op_type == "FullyConnected"
      || op_type == "Softmax"
      || op_type == "Squeeze") {
    return;
  } else {
    // Without per-output shape information we cannot judge GPU
    // compatibility; leave the registry's answer untouched.
    if (op.output_shape_size() != op.output_size()) {
      return;
    }
    if (op.output_shape(0).dims_size() != 4) {
      available_devices->erase(DeviceType::GPU);
    }
    if (op_type == "Split") {
      // GPU Split requires a 4-rank output with channels divisible by 4.
      // dims_size() is re-checked so the dims()[3] access short-circuits
      // safely on lower-rank outputs.
      if (op.output_shape(0).dims_size() != 4
          || op.output_shape(0).dims()[3] % 4 != 0) {
        available_devices->erase(DeviceType::GPU);
      }
    } else if (op_type == "Concat") {
      if (op.output_shape(0).dims_size() != 4) {
        available_devices->erase(DeviceType::GPU);
      } else {
        // The GPU kernel only handles the two-input case generically;
        // with any other input count every known input must have
        // channels divisible by 4.
        if (op.input_size() != 2) {
          for (const std::string &input : op.input()) {
            if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
              auto &input_shape = tensor_shape_info.at(input);
              if (input_shape[3] % 4 != 0) {
                available_devices->erase(DeviceType::GPU);
                break;
              }
            }
          }
        }
      }
    } else if (op_type == "ChannelShuffle") {
      // Guard the dims(3) access: the rank check above only erases GPU
      // and falls through, so a non-4-rank output would otherwise hit a
      // protobuf index CHECK here. When rank != 4 GPU is already gone,
      // so skipping the divisibility test preserves behavior.
      if (op.output_shape(0).dims_size() == 4) {
        int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
            op, "group", 1);
        int channels = op.output_shape(0).dims(3);
        int channels_per_group = channels / groups;
        // GPU ChannelShuffle needs both the group count and the channels
        // per group to be multiples of 4.
        if (groups % 4 != 0 || channels_per_group % 4 != 0) {
          available_devices->erase(DeviceType::GPU);
        }
      }
    }
  }
}
}
// namespace
std
::
unique_ptr
<
Operation
>
SerialNet
::
CreateOperation
(
const
OpRegistryBase
*
op_registry
,
OpConstructContext
*
construct_context
,
std
::
shared_ptr
<
OperatorDef
>
op_def
,
const
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
index_t
>>
tensor_shape_info
,
DataFormat
data_format_flag
,
bool
is_quantize_model
)
{
// Create the Operation
DeviceType
target_device_type
=
target_device_
->
device_type
();
// Get available devices
auto
available_devices
=
op_registry
->
AvailableDevices
(
op_def
->
type
());
std
::
set
<
DeviceType
>
available_devices
;
FindAvailableDevicesForOp
(
*
op_registry
,
*
op_def
,
tensor_shape_info
,
&
available_devices
);
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
...
...
@@ -93,6 +161,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
}
}
op_def
->
set_device_type
(
device_type
);
// transpose output shape if run on CPU (default format is NHWC)
if
(
!
is_quantize_model
&&
device_type
==
DeviceType
::
CPU
&&
op_def
->
output_shape_size
()
==
op_def
->
output_size
())
{
...
...
@@ -139,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
MemoryType
target_mem_type
;
// quantize model flag
bool
is_quantize_model
=
IsQuantizedModel
(
*
net_def
);
//
DataFormat
data_format_flag
=
NHWC
;
if
(
target_device_
->
device_type
()
==
DeviceType
::
CPU
)
{
target_mem_type
=
MemoryType
::
CPU_BUFFER
;
...
...
@@ -163,6 +232,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_mem_type
,
DataType
::
DT_FLOAT
,
input_shape
,
-
1
));
}
}
#ifdef MACE_ENABLE_OPENCL
else
{
// GPU NOLINT[readability/braces]
target_mem_type
=
MemoryType
::
GPU_BUFFER
;
...
...
@@ -176,6 +246,22 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
}
#endif // MACE_ENABLE_OPENCL
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
index_t
>>
tensor_shape_info
;
for
(
auto
&
op
:
net_def
->
op
())
{
if
(
op
.
output_size
()
!=
op
.
output_shape_size
())
{
continue
;
}
for
(
int
i
=
0
;
i
<
op
.
output_size
();
++
i
)
{
tensor_shape_info
[
op
.
output
(
i
)]
=
std
::
move
(
std
::
vector
<
index_t
>
(
op
.
output_shape
(
i
).
dims
().
begin
(),
op
.
output_shape
(
i
).
dims
().
end
()));
}
}
for
(
auto
&
tensor
:
net_def
->
tensors
())
{
tensor_shape_info
[
tensor
.
name
()]
=
std
::
move
(
std
::
vector
<
index_t
>
(
tensor
.
dims
().
begin
(),
tensor
.
dims
().
end
()));
}
OpConstructContext
construct_context
(
ws_
);
for
(
int
idx
=
0
;
idx
<
net_def
->
op_size
();
++
idx
)
{
std
::
shared_ptr
<
OperatorDef
>
op_def
(
new
OperatorDef
(
net_def
->
op
(
idx
)));
...
...
@@ -183,6 +269,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto
op
=
CreateOperation
(
op_registry
,
&
construct_context
,
op_def
,
tensor_shape_info
,
data_format_flag
,
is_quantize_model
);
#ifdef MACE_ENABLE_OPENCL
...
...
@@ -211,7 +298,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto
&
output_info
=
output_map
.
at
(
op_def
->
input
(
i
));
// check whether the tensor has been transformed
if
(
transformed_set
.
count
(
t_input_name
)
==
0
)
{
VLOG
(
1
)
<<
"Add Transform operation to transform tensor '"
VLOG
(
1
)
<<
"Add Transform operation "
<<
op_def
->
name
()
<<
" to transform tensor "
<<
op_def
->
input
(
i
)
<<
"', from memory type "
<<
output_info
.
mem_type
<<
" to "
<<
wanted_in_mem_type
...
...
@@ -234,6 +322,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
op_registry
,
&
construct_context
,
transform_op_def
,
tensor_shape_info
,
data_format_flag
);
operators_
.
emplace_back
(
std
::
move
(
transform_op
));
transformed_set
.
insert
(
t_input_name
);
...
...
@@ -321,6 +410,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
op_registry
,
&
construct_context
,
transform_op_def
,
tensor_shape_info
,
output_data_format
);
operators_
.
emplace_back
(
std
::
move
(
transform_op
));
// where to do graph reference count.
...
...
mace/core/net.h
浏览文件 @
04b8524e
...
...
@@ -59,6 +59,8 @@ class SerialNet : public NetBase {
const
OpRegistryBase
*
op_registry
,
OpConstructContext
*
construct_context
,
std
::
shared_ptr
<
OperatorDef
>
op_def
,
const
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
index_t
>>
tensor_shape_info
,
DataFormat
input_format
,
bool
is_quantize_model
=
false
);
...
...
mace/core/workspace.cc
浏览文件 @
04b8524e
...
...
@@ -330,6 +330,14 @@ void Workspace::RemoveUnusedBuffer() {
void
Workspace
::
RemoveAndReloadBuffer
(
const
NetDef
&
net_def
,
const
unsigned
char
*
model_data
,
Allocator
*
alloc
)
{
std
::
unordered_set
<
std
::
string
>
tensor_to_host
;
for
(
auto
&
op
:
net_def
.
op
())
{
if
(
op
.
device_type
()
==
DeviceType
::
CPU
)
{
for
(
std
::
string
input
:
op
.
input
())
{
tensor_to_host
.
insert
(
input
);
}
}
}
for
(
auto
&
const_tensor
:
net_def
.
tensors
())
{
auto
iter
=
tensor_map_
.
find
(
const_tensor
.
name
());
if
(
iter
->
second
->
unused
())
{
...
...
@@ -340,16 +348,32 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
for
(
const
index_t
d
:
const_tensor
.
dims
())
{
dims
.
push_back
(
d
);
}
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
alloc
,
const_tensor
.
data_type
()));
tensor
->
Resize
(
dims
);
MACE_CHECK
(
tensor
->
size
()
==
const_tensor
.
data_size
(),
"Tensor's data_size not equal with the shape"
);
tensor
->
CopyBytes
(
model_data
+
const_tensor
.
offset
(),
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
()));
tensor_map_
[
const_tensor
.
name
()]
=
std
::
move
(
tensor
);
if
(
tensor_to_host
.
find
(
const_tensor
.
name
())
!=
tensor_to_host
.
end
())
{
DataType
host_data_type
=
const_tensor
.
data_type
();
if
(
host_data_type
==
DataType
::
DT_HALF
)
{
host_data_type
=
DataType
::
DT_FLOAT
;
}
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
alloc
,
host_data_type
,
true
,
const_tensor
.
name
()));
tensor
->
Resize
(
dims
);
MACE_CHECK
(
tensor
->
size
()
==
const_tensor
.
data_size
(),
"Tensor's data_size not equal with the shape"
);
if
(
const_tensor
.
data_type
()
==
DataType
::
DT_HALF
)
{
Tensor
::
MappingGuard
guard
(
tensor
.
get
());
float
*
dst_data
=
tensor
->
mutable_data
<
float
>
();
const
half
*
org_data
=
reinterpret_cast
<
const
half
*>
(
model_data
+
const_tensor
.
offset
());
for
(
index_t
i
=
0
;
i
<
const_tensor
.
data_size
();
++
i
)
{
dst_data
[
i
]
=
half_float
::
half_cast
<
float
>
(
org_data
[
i
]);
}
}
else
{
tensor
->
CopyBytes
(
model_data
+
const_tensor
.
offset
(),
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
()));
}
tensor_map_
[
const_tensor
.
name
()]
=
std
::
move
(
tensor
);
}
}
}
tensor_buffer_
.
reset
(
nullptr
);
...
...
mace/libmace/mace.cc
浏览文件 @
04b8524e
...
...
@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR
(
ws_
->
PreallocateOutputTensor
(
*
net_def
,
&
mem_optimizer
,
device_
.
get
()));
if
(
device_type_
==
DeviceType
::
GPU
)
{
ws_
->
RemoveAndReloadBuffer
(
*
net_def
,
model_data
,
device_
->
allocator
());
}
MACE_RETURN_IF_ERROR
(
net_
->
Init
());
#ifdef MACE_ENABLE_HEXAGON
}
#endif
if
(
device_type_
==
DeviceType
::
GPU
)
{
ws_
->
RemoveAndReloadBuffer
(
*
net_def
,
model_data
,
device_
->
allocator
());
}
return
MaceStatus
::
MACE_SUCCESS
;
}
...
...
mace/ops/channel_shuffle.cc
浏览文件 @
04b8524e
...
...
@@ -58,14 +58,12 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
#pragma omp parallel for collapse(2) schedule(runtime)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
T
*
input_base
=
input_ptr
+
b
*
batch_size
;
T
*
output_base
=
output_ptr
+
b
*
batch_size
;
index_t
g
=
c
%
groups_
;
index_t
idx
=
c
/
groups_
;
for
(
index_t
hw
=
0
;
hw
<
height
*
width
;
++
hw
)
{
output_base
[
c
*
image_size
+
hw
]
=
input_base
[
(
g
*
channels_per_group
+
idx
)
*
image_size
+
hw
]
;
}
const
T
*
in_ptr
=
input_ptr
+
b
*
batch_size
+
(
g
*
channels_per_group
+
idx
)
*
image_size
;
T
*
out_ptr
=
output_ptr
+
b
*
batch_size
+
c
*
image_size
;
memcpy
(
out_ptr
,
in_ptr
,
image_size
*
sizeof
(
float
));
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录