Commit a29b7fbc authored by wuchenghui

cpu/neon memory optimize

Parent 5c46f98d
@@ -119,19 +119,20 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
     tensor_map_[const_tensor.name()] = std::move(tensor);
   }
 
-  if (type == DeviceType::OPENCL) {
-    CreateImageOutputTensor(net_def);
+  if (type == DeviceType::CPU || type == DeviceType::OPENCL) {
+    CreateOutputTensorBuffer(net_def, type);
   }
 }
 
-void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
+void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
+                                         DeviceType device_type) {
   if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
     return;
   }
 
   DataType dtype = DataType::DT_INVALID;
-  // We use the data type of the first op (with mem id, must be image),
-  // as GPU have consistent data type for each layer for now.
+  // We use the data type of the first op with mem id,
+  // as CPU&GPU have consistent data type for each layer for now.
   // As DSP may have different data output type for each op,
   // we stick to the same concept.
   for (auto &op : net_def.op()) {
@@ -148,11 +149,19 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
   }
   MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    std::unique_ptr<BufferBase> image_buf(
-        new Image({mem_block.x(), mem_block.y()}, dtype));
-    preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
+    if (device_type == DeviceType::OPENCL) {
+      std::unique_ptr<BufferBase> image_buf(
+          new Image({mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    }
   }
-  VLOG(3) << "Preallocate image to tensors";
+  VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
     if (!op.mem_id().empty()) {
       auto mem_ids = op.mem_id();
@@ -161,15 +170,17 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
         std::unique_ptr<Tensor> tensor
             (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
         tensor->SetSourceOpName(op.name());
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
-                << " Mem: " << mem_ids[i]
-                << " Image shape: "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
-                    ->image_shape()[0]
-                << ", "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
-                    ->image_shape()[1];
+        if (device_type == DeviceType::OPENCL) {
+          VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                  << " Mem: " << mem_ids[i]
+                  << " Image shape: "
+                  << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                      ->image_shape()[0]
+                  << ", "
+                  << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                      ->image_shape()[1];
+        }
         tensor_map_[op.output(i)] = std::move(tensor);
       }
     }
   }
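The hunk above is the runtime counterpart of the Python optimizer changes below: on the OpenCL path each arena block becomes a 2-D Image of x*y pixels, while on CPU it becomes a flat Buffer of x elements (which is why the CPU optimizer below always writes y = 1 into the arena). A toy Python model of that dispatch, for illustration only; preallocate and the tuple stand-ins are not MACE APIs:

def preallocate(mem_arena, device_type):
    # Tuples stand in for the C++ Image/Buffer objects.
    buffers = {}
    for block in mem_arena.mem_block:
        if device_type == 'OPENCL':
            buffers[block.mem_id] = ('Image', (block.x, block.y))  # 2-D, in pixels
        else:
            buffers[block.mem_id] = ('Buffer', block.x)  # 1-D, element count
    return buffers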
...
@@ -52,7 +52,7 @@ class Workspace {
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
 
  private:
-  void CreateImageOutputTensor(const NetDef &net_def);
+  void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type);
 
   TensorMap tensor_map_;
...
@@ -1188,8 +1188,11 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str,
     print "PB Converted."
     if device == 'gpu':
         print "start optimize memory."
-        mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
-        mem_optimizer.optimize()
+        memory_optimizer.optimize_gpu_memory(net_def)
+        print "Memory optimization done."
+    elif device == 'cpu':
+        print "start optimize memory."
+        memory_optimizer.optimize_cpu_memory(net_def)
         print "Memory optimization done."
 
     return net_def
@@ -22,13 +22,13 @@ class MemoryOptimizer(object):
         self.net_def = net_def
         self.idle_mem = set()
         self.op_mem = {}  # op_name->mem_id
-        self.mem_block = {}  # mem_id->[x, y]
+        self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
         self.ref_counter = {}
 
         consumers = {}
         for op in net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             for ipt in op.input:
                 if ipt not in consumers:
@@ -36,7 +36,7 @@ class MemoryOptimizer(object):
                 consumers[ipt].append(op)
         # only ref op's output tensor
         for op in net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             for output in op.output:
                 tensor_name = output
@@ -45,29 +45,47 @@ class MemoryOptimizer(object):
             else:
                 self.ref_counter[tensor_name] = 0
 
-    def is_buffer_image_op(self, op):
-        if op.type == 'BufferToImage':
-            for arg in op.arg:
-                if arg.name == 'mode' and arg.i == 0:
-                    return True
-        return op.type == 'ImageToBuffer'
-
-    def get_mem_size(self, op_type, output_shape):
-        mem_size = [0, 0]
-        if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_size[0] = output_shape[2] * output_shape[3]
-            mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
-        else:
-            mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-            mem_size[1] = output_shape[0] * output_shape[1]
-        return mem_size
-
-    def mem_area(self, memory_size):
-        return memory_size[0] * memory_size[1]
+    def op_need_optimize_memory(self, op):
+        return True
+
+    def get_op_mem_block(self, op_type, output_shape):
+        return [reduce(operator.mul, output_shape, 1)]
+
+    def mem_size(self, memory_block):
+        return memory_block[0]
+
+    def sub_mem_block(self, mem_block1, mem_block2):
+        return self.mem_size(mem_block1) - self.mem_size(mem_block2)
+
+    def resize_mem_block(self, old_mem_block, op_mem_block):
+        return [max(old_mem_block[0], op_mem_block[0])]
+
+    def add_net_mem_blocks(self):
+        for mem in self.mem_block:
+            arena = self.net_def.mem_arena
+            block = arena.mem_block.add()
+            block.mem_id = mem
+            block.x = self.mem_block[mem][0]
+            block.y = 1
+
+    def get_total_origin_mem_size(self):
+        origin_mem_size = 0
+        for op in self.net_def.op:
+            if not self.op_need_optimize_memory(op):
+                continue
+            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
+        return origin_mem_size
+
+    def get_total_optimized_mem_size(self):
+        optimized_mem_size = 0
+        for mem in self.mem_block:
+            print mem, self.mem_block[mem]
+            optimized_mem_size += self.mem_size(self.mem_block[mem])
+        return optimized_mem_size
 
     def optimize(self):
         for op in self.net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             if not op.output_shape:
                 print('WARNING: There is no output shape information to '
@@ -78,38 +96,42 @@ class MemoryOptimizer(object):
                       'the number of output.')
                 return
             for i in range(len(op.output)):
-                op_mem_size = self.get_mem_size(op.type,
-                                                op.output_shape[i].dims)
+                op_mem_block = self.get_op_mem_block(op.type,
+                                                     op.output_shape[i].dims)
                 mem_id = -1
                 if len(self.idle_mem) > 0:
-                    best_mem_candidate_id = -1
-                    best_mem_candidate_delta_area = sys.maxint
-                    best_mem_candidate_shape = []
+                    best_mem_add_size = sys.maxint
+                    best_mem_waste_size = sys.maxint
                     for mid in self.idle_mem:
-                        reuse_mem_size = self.mem_block[mid]
-                        resize_mem_size = [
-                            max(reuse_mem_size[0], op_mem_size[0]),
-                            max(reuse_mem_size[1], op_mem_size[1])
-                        ]
-                        delta_mem_area = self.mem_area(
-                            resize_mem_size) - self.mem_area(reuse_mem_size)
-                        if delta_mem_area < best_mem_candidate_delta_area:
-                            best_mem_candidate_id = mid
-                            best_mem_candidate_delta_area = delta_mem_area
-                            best_mem_candidate_shape = resize_mem_size
+                        old_mem_block = self.mem_block[mid]
+                        new_mem_block = self.resize_mem_block(
+                            old_mem_block, op_mem_block)
+                        add_mem_size = self.sub_mem_block(new_mem_block,
+                                                          old_mem_block)
+                        waste_mem_size = self.sub_mem_block(new_mem_block,
+                                                            op_mem_block)
+
+                        # minimize add_mem_size; if best_mem_add_size is 0,
+                        # then minimize waste_mem_size
+                        if (best_mem_add_size > 0 and
+                                add_mem_size < best_mem_add_size) \
+                                or (best_mem_add_size == 0 and
+                                    waste_mem_size < best_mem_waste_size):
+                            best_mem_id = mid
+                            best_mem_add_size = add_mem_size
+                            best_mem_waste_size = waste_mem_size
+                            best_mem_block = new_mem_block
 
-                    if best_mem_candidate_delta_area <= self.mem_area(
-                            op_mem_size):
-                        # reuse
-                        self.mem_block[
-                            best_mem_candidate_id] = best_mem_candidate_shape
-                        mem_id = best_mem_candidate_id
+                    # if add mem size < op mem size, then reuse it
+                    if best_mem_add_size <= self.mem_size(op_mem_block):
+                        self.mem_block[best_mem_id] = best_mem_block
+                        mem_id = best_mem_id
                         self.idle_mem.remove(mem_id)
 
                 if mem_id == -1:
                     mem_id = self.total_mem_count
                     self.total_mem_count += 1
-                    self.mem_block[mem_id] = op_mem_size
+                    self.mem_block[mem_id] = op_mem_block
 
                 op.mem_id.extend([mem_id])
                 self.op_mem[op.output[i]] = mem_id
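The reuse rule above is the heart of the optimizer: grow an idle block as little as possible, and once some block needs no growth at all, prefer the one that wastes the least space; a fresh block is allocated only when every reuse would cost more than the op's own size. A standalone sketch that replays the same rule on a plain dict (pick_block is a hypothetical helper, not part of the commit; sizes are element counts as in the CPU optimizer):

import sys

def pick_block(idle_blocks, op_size):
    # idle_blocks: mem_id -> current block size; returns (mem_id, grown size)
    # if some idle block should be reused, else None.
    best_id = None
    best_add = best_waste = sys.maxsize
    best_new_size = 0
    for mem_id, old_size in idle_blocks.items():
        new_size = max(old_size, op_size)
        add = new_size - old_size      # elements the block must grow by
        waste = new_size - op_size     # slack left after the op uses it
        if (best_add > 0 and add < best_add) or \
                (best_add == 0 and waste < best_waste):
            best_id, best_add, best_waste = mem_id, add, waste
            best_new_size = new_size
    if best_id is not None and best_add <= op_size:  # reuse beats a fresh block
        return best_id, best_new_size
    return None

# Block 0 would have to grow by 25088 elements; block 1 already fits
# (waste 25088), so the zero-growth candidate wins:
print(pick_block({0: 50176, 1: 100352}, 75264))  # -> (1, 100352)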
@@ -123,6 +145,43 @@ class MemoryOptimizer(object):
                 elif self.ref_counter[ipt] < 0:
                     raise Exception('ref count is less than 0')
 
+        self.add_net_mem_blocks()
+        print('total op: %d', len(self.net_def.op))
+        print('origin mem: %d, optimized mem: %d',
+              self.get_total_origin_mem_size(),
+              self.get_total_optimized_mem_size())
+
+
+class GPUMemoryOptimizer(MemoryOptimizer):
+    def op_need_optimize_memory(self, op):
+        if op.type == 'BufferToImage':
+            for arg in op.arg:
+                if arg.name == 'mode' and arg.i == 0:
+                    return False
+        return op.type != 'ImageToBuffer'
+
+    def get_op_mem_block(self, op_type, output_shape):
+        mem_block = [0, 0]
+        if op_type == 'WinogradTransform' or op_type == 'MatMul':
+            mem_block[0] = output_shape[2] * output_shape[3]
+            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+        else:
+            mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
+            mem_block[1] = output_shape[0] * output_shape[1]
+        return mem_block
+
+    def mem_size(self, memory_block):
+        return memory_block[0] * memory_block[1] * 4
+
+    def resize_mem_block(self, old_mem_block, op_mem_block):
+        resize_mem_block = [
+            max(old_mem_block[0], op_mem_block[0]),
+            max(old_mem_block[1], op_mem_block[1])
+        ]
+        return resize_mem_block
+
+    def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
@@ -130,21 +189,12 @@ class MemoryOptimizer(object):
             block.x = self.mem_block[mem][0]
             block.y = self.mem_block[mem][1]
 
-        print('total op: %d', len(self.net_def.op))
-        origin_mem_size = 0
-        optimized_mem_size = 0
-        for op in self.net_def.op:
-            if self.is_buffer_image_op(op):
-                continue
-            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
-        for mem in self.mem_block:
-            print mem, self.mem_block[mem]
-            optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
-        print('origin mem: %d, optimized mem: %d', origin_mem_size,
-              optimized_mem_size)
 
-def optimize_memory(net_def):
+def optimize_gpu_memory(net_def):
+    mem_optimizer = GPUMemoryOptimizer(net_def)
+    mem_optimizer.optimize()
+
+
+def optimize_cpu_memory(net_def):
     mem_optimizer = MemoryOptimizer(net_def)
     mem_optimizer.optimize()
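In the GPU subclass a block is a 2-D OpenCL image extent in pixels rather than a flat element count, and each pixel packs 4 values, hence the *4 in mem_size. A worked example with a hypothetical NHWC output shape (numbers not taken from the commit):

# Non-Winograd case: image width = W * ceil(C/4), height = N * H.
N, H, W, C = 1, 56, 56, 64                # hypothetical output shape
mem_block = [W * ((C + 3) // 4), N * H]   # -> [896, 56] pixels
print(mem_block)
print(mem_block[0] * mem_block[1] * 4)    # 200704 values == N*H*W*C exactly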
@@ -1367,8 +1367,11 @@ def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
     print "Model Converted."
     if device == 'gpu':
         print "start optimize memory."
-        mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
-        mem_optimizer.optimize()
+        memory_optimizer.optimize_gpu_memory(net_def)
+        print "Memory optimization done."
+    elif device == 'cpu':
+        print "start optimize memory."
+        memory_optimizer.optimize_cpu_memory(net_def)
         print "Memory optimization done."
 
     return net_def
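With both converters routed through optimize_gpu_memory/optimize_cpu_memory, the whole plan ends up inside the NetDef itself: each optimized op's mem_id records which arena block its output reuses, and mem_arena.mem_block records the block extents that Workspace::CreateOutputTensorBuffer later materializes. A small inspection sketch (assumes a populated mace_pb2.NetDef; the function name is illustrative):

def dump_mem_plan(net_def):
    for block in net_def.mem_arena.mem_block:
        print(block.mem_id, block.x, block.y)  # CPU blocks always have y == 1
    for op in net_def.op:
        if op.mem_id:
            print(op.name, list(op.mem_id))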