Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
59c2cfe1
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
59c2cfe1
编写于
4月 25, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'feature_wuch' into 'master'
cpu/neon memory optimize See merge request !422
上级
5c46f98d
a29b7fbc
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
147 addition
and
80 deletion
+147
-80
mace/core/workspace.cc
mace/core/workspace.cc
+28
-17
mace/core/workspace.h
mace/core/workspace.h
+1
-1
mace/python/tools/caffe_converter_lib.py
mace/python/tools/caffe_converter_lib.py
+5
-2
mace/python/tools/memory_optimizer.py
mace/python/tools/memory_optimizer.py
+108
-58
mace/python/tools/tf_converter_lib.py
mace/python/tools/tf_converter_lib.py
+5
-2
未找到文件。
mace/core/workspace.cc
浏览文件 @
59c2cfe1
...
...
@@ -119,19 +119,20 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
tensor_map_
[
const_tensor
.
name
()]
=
std
::
move
(
tensor
);
}
if
(
type
==
DeviceType
::
OPENCL
)
{
Create
ImageOutputTensor
(
net_def
);
if
(
type
==
DeviceType
::
CPU
||
type
==
DeviceType
::
OPENCL
)
{
Create
OutputTensorBuffer
(
net_def
,
type
);
}
}
void
Workspace
::
CreateImageOutputTensor
(
const
NetDef
&
net_def
)
{
void
Workspace
::
CreateOutputTensorBuffer
(
const
NetDef
&
net_def
,
DeviceType
device_type
)
{
if
(
!
net_def
.
has_mem_arena
()
||
net_def
.
mem_arena
().
mem_block_size
()
==
0
)
{
return
;
}
DataType
dtype
=
DataType
::
DT_INVALID
;
// We use the data type of the first op
(with mem id, must be image)
,
// as GPU have consistent data type for each layer for now.
// We use the data type of the first op
with mem id
,
// as
CPU&
GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for
(
auto
&
op
:
net_def
.
op
())
{
...
...
@@ -148,11 +149,19 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
MACE_CHECK
(
dtype
!=
DataType
::
DT_INVALID
,
"data type is invalid."
);
for
(
auto
&
mem_block
:
net_def
.
mem_arena
().
mem_block
())
{
std
::
unique_ptr
<
BufferBase
>
image_buf
(
new
Image
({
mem_block
.
x
(),
mem_block
.
y
()},
dtype
));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
image_buf
));
if
(
device_type
==
DeviceType
::
OPENCL
)
{
std
::
unique_ptr
<
BufferBase
>
image_buf
(
new
Image
({
mem_block
.
x
(),
mem_block
.
y
()},
dtype
));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
image_buf
));
}
else
{
std
::
unique_ptr
<
BufferBase
>
tensor_buf
(
new
Buffer
(
GetDeviceAllocator
(
device_type
),
mem_block
.
x
()));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
tensor_buf
));
}
}
VLOG
(
3
)
<<
"Preallocate
image
to tensors"
;
VLOG
(
3
)
<<
"Preallocate
buffer
to tensors"
;
for
(
auto
&
op
:
net_def
.
op
())
{
if
(
!
op
.
mem_id
().
empty
())
{
auto
mem_ids
=
op
.
mem_id
();
...
...
@@ -161,15 +170,17 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
preallocated_allocator_
.
GetBuffer
(
mem_ids
[
i
]),
dtype
));
tensor
->
SetSourceOpName
(
op
.
name
());
VLOG
(
3
)
<<
"Tensor: "
<<
op
.
name
()
<<
"("
<<
op
.
type
()
<<
")"
<<
" Mem: "
<<
mem_ids
[
i
]
<<
" Image shape: "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
0
]
<<
", "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
1
];
tensor_map_
[
op
.
output
(
i
)]
=
std
::
move
(
tensor
);
if
(
device_type
==
DeviceType
::
OPENCL
)
{
VLOG
(
3
)
<<
"Tensor: "
<<
op
.
name
()
<<
"("
<<
op
.
type
()
<<
")"
<<
" Mem: "
<<
mem_ids
[
i
]
<<
" Image shape: "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
0
]
<<
", "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
1
];
}
}
}
}
...
...
mace/core/workspace.h
浏览文件 @
59c2cfe1
...
...
@@ -52,7 +52,7 @@ class Workspace {
ScratchBuffer
*
GetScratchBuffer
(
DeviceType
device_type
);
private:
void
Create
ImageOutputTensor
(
const
NetDef
&
net_def
);
void
Create
OutputTensorBuffer
(
const
NetDef
&
net_def
,
DeviceType
device_type
);
TensorMap
tensor_map_
;
...
...
mace/python/tools/caffe_converter_lib.py
浏览文件 @
59c2cfe1
...
...
@@ -1188,8 +1188,11 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str,
print
"PB Converted."
if
device
==
'gpu'
:
print
"start optimize memory."
mem_optimizer
=
memory_optimizer
.
MemoryOptimizer
(
net_def
)
mem_optimizer
.
optimize
()
memory_optimizer
.
optimize_gpu_memory
(
net_def
)
print
"Memory optimization done."
elif
device
==
'cpu'
:
print
"start optimize memory."
memory_optimizer
.
optimize_cpu_memory
(
net_def
)
print
"Memory optimization done."
return
net_def
mace/python/tools/memory_optimizer.py
浏览文件 @
59c2cfe1
...
...
@@ -22,13 +22,13 @@ class MemoryOptimizer(object):
self
.
net_def
=
net_def
self
.
idle_mem
=
set
()
self
.
op_mem
=
{}
# op_name->mem_id
self
.
mem_block
=
{}
# mem_id->[x, y]
self
.
mem_block
=
{}
# mem_id->[
size] or mem_id->[
x, y]
self
.
total_mem_count
=
0
self
.
ref_counter
=
{}
consumers
=
{}
for
op
in
net_def
.
op
:
if
self
.
is_buffer_image_op
(
op
):
if
not
self
.
op_need_optimize_memory
(
op
):
continue
for
ipt
in
op
.
input
:
if
ipt
not
in
consumers
:
...
...
@@ -36,7 +36,7 @@ class MemoryOptimizer(object):
consumers
[
ipt
].
append
(
op
)
# only ref op's output tensor
for
op
in
net_def
.
op
:
if
self
.
is_buffer_image_op
(
op
):
if
not
self
.
op_need_optimize_memory
(
op
):
continue
for
output
in
op
.
output
:
tensor_name
=
output
...
...
@@ -45,29 +45,47 @@ class MemoryOptimizer(object):
else
:
self
.
ref_counter
[
tensor_name
]
=
0
def
is_buffer_image_op
(
self
,
op
):
if
op
.
type
==
'BufferToImage'
:
for
arg
in
op
.
arg
:
if
arg
.
name
==
'mode'
and
arg
.
i
==
0
:
return
True
return
op
.
type
==
'ImageToBuffer'
def
op_need_optimize_memory
(
self
,
op
):
return
True
def
get_mem_size
(
self
,
op_type
,
output_shape
):
mem_size
=
[
0
,
0
]
if
op_type
==
'WinogradTransform'
or
op_type
==
'MatMul'
:
mem_size
[
0
]
=
output_shape
[
2
]
*
output_shape
[
3
]
mem_size
[
1
]
=
output_shape
[
0
]
*
int
((
output_shape
[
1
]
+
3
)
/
4
)
else
:
mem_size
[
0
]
=
output_shape
[
2
]
*
int
((
output_shape
[
3
]
+
3
)
/
4
)
mem_size
[
1
]
=
output_shape
[
0
]
*
output_shape
[
1
]
return
mem_size
def
get_op_mem_block
(
self
,
op_type
,
output_shape
):
return
[
reduce
(
operator
.
mul
,
output_shape
,
1
)]
def
mem_size
(
self
,
memory_block
):
return
memory_block
[
0
]
def
sub_mem_block
(
self
,
mem_block1
,
mem_block2
):
return
self
.
mem_size
(
mem_block1
)
-
self
.
mem_size
(
mem_block2
)
def
resize_mem_block
(
self
,
old_mem_block
,
op_mem_block
):
return
[
max
(
old_mem_block
[
0
],
op_mem_block
[
0
])]
def
add_net_mem_blocks
(
self
):
for
mem
in
self
.
mem_block
:
arena
=
self
.
net_def
.
mem_arena
block
=
arena
.
mem_block
.
add
()
block
.
mem_id
=
mem
block
.
x
=
self
.
mem_block
[
mem
][
0
]
block
.
y
=
1
def
mem_area
(
self
,
memory_size
):
return
memory_size
[
0
]
*
memory_size
[
1
]
def
get_total_origin_mem_size
(
self
):
origin_mem_size
=
0
for
op
in
self
.
net_def
.
op
:
if
not
self
.
op_need_optimize_memory
(
op
):
continue
origin_mem_size
+=
reduce
(
operator
.
mul
,
op
.
output_shape
[
0
].
dims
,
1
)
return
origin_mem_size
def
get_total_optimized_mem_size
(
self
):
optimized_mem_size
=
0
for
mem
in
self
.
mem_block
:
print
mem
,
self
.
mem_block
[
mem
]
optimized_mem_size
+=
self
.
mem_size
(
self
.
mem_block
[
mem
])
return
optimized_mem_size
def
optimize
(
self
):
for
op
in
self
.
net_def
.
op
:
if
self
.
is_buffer_image_op
(
op
):
if
not
self
.
op_need_optimize_memory
(
op
):
continue
if
not
op
.
output_shape
:
print
(
'WARNING: There is no output shape information to '
...
...
@@ -78,38 +96,42 @@ class MemoryOptimizer(object):
'the number of output.'
)
return
for
i
in
range
(
len
(
op
.
output
)):
op_mem_
size
=
self
.
get_mem_size
(
op
.
type
,
op
.
output_shape
[
i
].
dims
)
op_mem_
block
=
self
.
get_op_mem_block
(
op
.
type
,
op
.
output_shape
[
i
].
dims
)
mem_id
=
-
1
if
len
(
self
.
idle_mem
)
>
0
:
best_mem_candidate_id
=
-
1
best_mem_candidate_delta_area
=
sys
.
maxint
best_mem_candidate_shape
=
[]
best_mem_add_size
=
sys
.
maxint
best_mem_waste_size
=
sys
.
maxint
for
mid
in
self
.
idle_mem
:
reuse_mem_size
=
self
.
mem_block
[
mid
]
resize_mem_size
=
[
max
(
reuse_mem_size
[
0
],
op_mem_size
[
0
]),
max
(
reuse_mem_size
[
1
],
op_mem_size
[
1
])
]
delta_mem_area
=
self
.
mem_area
(
resize_mem_size
)
-
self
.
mem_area
(
reuse_mem_size
)
if
delta_mem_area
<
best_mem_candidate_delta_area
:
best_mem_candidate_id
=
mid
best_mem_candidate_delta_area
=
delta_mem_area
best_mem_candidate_shape
=
resize_mem_size
if
best_mem_candidate_delta_area
<=
self
.
mem_area
(
op_mem_size
):
# reuse
self
.
mem_block
[
best_mem_candidate_id
]
=
best_mem_candidate_shape
mem_id
=
best_mem_candidate_id
old_mem_block
=
self
.
mem_block
[
mid
]
new_mem_block
=
self
.
resize_mem_block
(
old_mem_block
,
op_mem_block
)
add_mem_size
=
self
.
sub_mem_block
(
new_mem_block
,
old_mem_block
)
waste_mem_size
=
self
.
sub_mem_block
(
new_mem_block
,
op_mem_block
)
# minimize add_mem_size; if best_mem_add_size is 0,
# then minimize waste_mem_size
if
(
best_mem_add_size
>
0
and
add_mem_size
<
best_mem_add_size
)
\
or
(
best_mem_add_size
==
0
and
waste_mem_size
<
best_mem_waste_size
):
best_mem_id
=
mid
best_mem_add_size
=
add_mem_size
best_mem_waste_size
=
waste_mem_size
best_mem_block
=
new_mem_block
# if add mem size < op mem size, then reuse it
if
best_mem_add_size
<=
self
.
mem_size
(
op_mem_block
):
self
.
mem_block
[
best_mem_id
]
=
best_mem_block
mem_id
=
best_mem_id
self
.
idle_mem
.
remove
(
mem_id
)
if
mem_id
==
-
1
:
mem_id
=
self
.
total_mem_count
self
.
total_mem_count
+=
1
self
.
mem_block
[
mem_id
]
=
op_mem_
size
self
.
mem_block
[
mem_id
]
=
op_mem_
block
op
.
mem_id
.
extend
([
mem_id
])
self
.
op_mem
[
op
.
output
[
i
]]
=
mem_id
...
...
@@ -123,6 +145,43 @@ class MemoryOptimizer(object):
elif
self
.
ref_counter
[
ipt
]
<
0
:
raise
Exception
(
'ref count is less than 0'
)
self
.
add_net_mem_blocks
()
print
(
'total op: %d'
,
len
(
self
.
net_def
.
op
))
print
(
'origin mem: %d, optimized mem: %d'
,
self
.
get_total_origin_mem_size
(),
self
.
get_total_optimized_mem_size
())
class
GPUMemoryOptimizer
(
MemoryOptimizer
):
def
op_need_optimize_memory
(
self
,
op
):
if
op
.
type
==
'BufferToImage'
:
for
arg
in
op
.
arg
:
if
arg
.
name
==
'mode'
and
arg
.
i
==
0
:
return
False
return
op
.
type
!=
'ImageToBuffer'
def
get_op_mem_block
(
self
,
op_type
,
output_shape
):
mem_block
=
[
0
,
0
]
if
op_type
==
'WinogradTransform'
or
op_type
==
'MatMul'
:
mem_block
[
0
]
=
output_shape
[
2
]
*
output_shape
[
3
]
mem_block
[
1
]
=
output_shape
[
0
]
*
int
((
output_shape
[
1
]
+
3
)
/
4
)
else
:
mem_block
[
0
]
=
output_shape
[
2
]
*
int
((
output_shape
[
3
]
+
3
)
/
4
)
mem_block
[
1
]
=
output_shape
[
0
]
*
output_shape
[
1
]
return
mem_block
def
mem_size
(
self
,
memory_block
):
return
memory_block
[
0
]
*
memory_block
[
1
]
*
4
def
resize_mem_block
(
self
,
old_mem_block
,
op_mem_block
):
resize_mem_block
=
[
max
(
old_mem_block
[
0
],
op_mem_block
[
0
]),
max
(
old_mem_block
[
1
],
op_mem_block
[
1
])
]
return
resize_mem_block
def
add_net_mem_blocks
(
self
):
for
mem
in
self
.
mem_block
:
arena
=
self
.
net_def
.
mem_arena
block
=
arena
.
mem_block
.
add
()
...
...
@@ -130,21 +189,12 @@ class MemoryOptimizer(object):
block
.
x
=
self
.
mem_block
[
mem
][
0
]
block
.
y
=
self
.
mem_block
[
mem
][
1
]
print
(
'total op: %d'
,
len
(
self
.
net_def
.
op
))
origin_mem_size
=
0
optimized_mem_size
=
0
for
op
in
self
.
net_def
.
op
:
if
self
.
is_buffer_image_op
(
op
):
continue
origin_mem_size
+=
reduce
(
operator
.
mul
,
op
.
output_shape
[
0
].
dims
,
1
)
for
mem
in
self
.
mem_block
:
print
mem
,
self
.
mem_block
[
mem
]
optimized_mem_size
+=
reduce
(
operator
.
mul
,
self
.
mem_block
[
mem
],
4
)
print
(
'origin mem: %d, optimized mem: %d'
,
origin_mem_size
,
optimized_mem_size
)
def
optimize_gpu_memory
(
net_def
):
mem_optimizer
=
GPUMemoryOptimizer
(
net_def
)
mem_optimizer
.
optimize
()
def
optimize_memory
(
net_def
):
def
optimize_
cpu_
memory
(
net_def
):
mem_optimizer
=
MemoryOptimizer
(
net_def
)
mem_optimizer
.
optimize
()
mace/python/tools/tf_converter_lib.py
浏览文件 @
59c2cfe1
...
...
@@ -1367,8 +1367,11 @@ def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
print
"Model Converted."
if
device
==
'gpu'
:
print
"start optimize memory."
mem_optimizer
=
memory_optimizer
.
MemoryOptimizer
(
net_def
)
mem_optimizer
.
optimize
()
memory_optimizer
.
optimize_gpu_memory
(
net_def
)
print
"Memory optimization done."
elif
device
==
'cpu'
:
print
"start optimize memory."
memory_optimizer
.
optimize_cpu_memory
(
net_def
)
print
"Memory optimization done."
return
net_def
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录