diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 7a3bd994fa8baaae98a5878f92c73c0ef6ca74ae..c369b15cd00d4cde99bbb172d468d1e4b0147c9f 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -119,19 +119,20 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
     tensor_map_[const_tensor.name()] = std::move(tensor);
   }
 
-  if (type == DeviceType::OPENCL) {
-    CreateImageOutputTensor(net_def);
+  if (type == DeviceType::CPU || type == DeviceType::OPENCL) {
+    CreateOutputTensorBuffer(net_def, type);
   }
 }
 
-void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
+void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
+                                         DeviceType device_type) {
   if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
     return;
   }
 
   DataType dtype = DataType::DT_INVALID;
-  // We use the data type of the first op (with mem id, must be image),
-  // as GPU have consistent data type for each layer for now.
+  // We use the data type of the first op with mem id,
+  // as CPU&GPU have consistent data type for each layer for now.
   // As DSP may have different data output type for each op,
   // we stick to the same concept.
   for (auto &op : net_def.op()) {
@@ -148,11 +149,19 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
   }
   MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    std::unique_ptr<BufferBase> image_buf(
-        new Image({mem_block.x(), mem_block.y()}, dtype));
-    preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
+    if (device_type == DeviceType::OPENCL) {
+      std::unique_ptr<BufferBase> image_buf(
+          new Image({mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    }
   }
-  VLOG(3) << "Preallocate image to tensors";
+  VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
     if (!op.mem_id().empty()) {
       auto mem_ids = op.mem_id();
@@ -161,15 +170,17 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
         std::unique_ptr<Tensor> tensor
             (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
         tensor->SetSourceOpName(op.name());
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
-                << " Mem: " << mem_ids[i]
-                << " Image shape: "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
-                    ->image_shape()[0]
-                << ", "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
-                    ->image_shape()[1];
         tensor_map_[op.output(i)] = std::move(tensor);
+        if (device_type == DeviceType::OPENCL) {
+          VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                  << " Mem: " << mem_ids[i]
+                  << " Image shape: "
+                  << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                      ->image_shape()[0]
+                  << ", "
+                  << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                      ->image_shape()[1];
+        }
       }
     }
   }
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index b4b75995d25de4cabd90285d848baa4191c846ba..e9e11ea38810a8fd71c9871c07cb199c803d3dd2 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -52,7 +52,7 @@ class Workspace {
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
 
  private:
-  void CreateImageOutputTensor(const NetDef &net_def);
+  void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type);
 
   TensorMap tensor_map_;
 
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 160579720b422d74a94eb8f8ba7fa59866b70604..cc961c36f334ab2c5080b34cfec41c6e210bbf98 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -1188,8 +1188,11 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str,
     print "PB Converted."
     if device == 'gpu':
         print "start optimize memory."
-        mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
-        mem_optimizer.optimize()
+        memory_optimizer.optimize_gpu_memory(net_def)
+        print "Memory optimization done."
+    elif device == 'cpu':
+        print "start optimize memory."
+        memory_optimizer.optimize_cpu_memory(net_def)
         print "Memory optimization done."
 
     return net_def
diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py
index fddb50e276d9f23f00ced9b666681467585283ee..38e3a36b6550f76f441983fb2826fa7b2268a0a5 100644
--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -22,13 +22,13 @@ class MemoryOptimizer(object):
         self.net_def = net_def
         self.idle_mem = set()
         self.op_mem = {}  # op_name->mem_id
-        self.mem_block = {}  # mem_id->[x, y]
+        self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
         self.ref_counter = {}
 
         consumers = {}
         for op in net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             for ipt in op.input:
                 if ipt not in consumers:
@@ -36,7 +36,7 @@ class MemoryOptimizer(object):
                 consumers[ipt].append(op)
         # only ref op's output tensor
        for op in net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             for output in op.output:
                 tensor_name = output
@@ -45,29 +45,47 @@ class MemoryOptimizer(object):
                 else:
                     self.ref_counter[tensor_name] = 0
 
-    def is_buffer_image_op(self, op):
-        if op.type == 'BufferToImage':
-            for arg in op.arg:
-                if arg.name == 'mode' and arg.i == 0:
-                    return True
-        return op.type == 'ImageToBuffer'
+    def op_need_optimize_memory(self, op):
+        return True
 
-    def get_mem_size(self, op_type, output_shape):
-        mem_size = [0, 0]
-        if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_size[0] = output_shape[2] * output_shape[3]
-            mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
-        else:
-            mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-            mem_size[1] = output_shape[0] * output_shape[1]
-        return mem_size
+    def get_op_mem_block(self, op_type, output_shape):
+        return [reduce(operator.mul, output_shape, 1)]
+
+    def mem_size(self, memory_block):
+        return memory_block[0]
+
+    def sub_mem_block(self, mem_block1, mem_block2):
+        return self.mem_size(mem_block1) - self.mem_size(mem_block2)
+
+    def resize_mem_block(self, old_mem_block, op_mem_block):
+        return [max(old_mem_block[0], op_mem_block[0])]
+
+    def add_net_mem_blocks(self):
+        for mem in self.mem_block:
+            arena = self.net_def.mem_arena
+            block = arena.mem_block.add()
+            block.mem_id = mem
+            block.x = self.mem_block[mem][0]
+            block.y = 1
 
-    def mem_area(self, memory_size):
-        return memory_size[0] * memory_size[1]
+    def get_total_origin_mem_size(self):
+        origin_mem_size = 0
+        for op in self.net_def.op:
+            if not self.op_need_optimize_memory(op):
+                continue
+            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
+        return origin_mem_size
+
+    def get_total_optimized_mem_size(self):
+        optimized_mem_size = 0
+        for mem in self.mem_block:
+            print mem, self.mem_block[mem]
+            optimized_mem_size += self.mem_size(self.mem_block[mem])
+        return optimized_mem_size
 
     def optimize(self):
         for op in self.net_def.op:
-            if self.is_buffer_image_op(op):
+            if not self.op_need_optimize_memory(op):
                 continue
             if not op.output_shape:
                 print('WARNING: There is no output shape information to '
@@ -78,38 +96,42 @@ class MemoryOptimizer(object):
                       'the number of output.')
                 return
             for i in range(len(op.output)):
-                op_mem_size = self.get_mem_size(op.type,
-                                                op.output_shape[i].dims)
+                op_mem_block = self.get_op_mem_block(op.type,
+                                                     op.output_shape[i].dims)
                 mem_id = -1
                 if len(self.idle_mem) > 0:
-                    best_mem_candidate_id = -1
-                    best_mem_candidate_delta_area = sys.maxint
-                    best_mem_candidate_shape = []
+                    best_mem_add_size = sys.maxint
+                    best_mem_waste_size = sys.maxint
                     for mid in self.idle_mem:
-                        reuse_mem_size = self.mem_block[mid]
-                        resize_mem_size = [
-                            max(reuse_mem_size[0], op_mem_size[0]),
-                            max(reuse_mem_size[1], op_mem_size[1])
-                        ]
-                        delta_mem_area = self.mem_area(
-                            resize_mem_size) - self.mem_area(reuse_mem_size)
-                        if delta_mem_area < best_mem_candidate_delta_area:
-                            best_mem_candidate_id = mid
-                            best_mem_candidate_delta_area = delta_mem_area
-                            best_mem_candidate_shape = resize_mem_size
-
-                    if best_mem_candidate_delta_area <= self.mem_area(
-                            op_mem_size):
-                        # reuse
-                        self.mem_block[
-                            best_mem_candidate_id] = best_mem_candidate_shape
-                        mem_id = best_mem_candidate_id
+                        old_mem_block = self.mem_block[mid]
+                        new_mem_block = self.resize_mem_block(
+                            old_mem_block, op_mem_block)
+                        add_mem_size = self.sub_mem_block(new_mem_block,
+                                                          old_mem_block)
+                        waste_mem_size = self.sub_mem_block(new_mem_block,
+                                                            op_mem_block)
+
+                        # minimize add_mem_size; if best_mem_add_size is 0,
+                        # then minimize waste_mem_size
+                        if (best_mem_add_size > 0 and
+                            add_mem_size < best_mem_add_size) \
+                                or (best_mem_add_size == 0 and
+                                    waste_mem_size < best_mem_waste_size):
+                            best_mem_id = mid
+                            best_mem_add_size = add_mem_size
+                            best_mem_waste_size = waste_mem_size
+                            best_mem_block = new_mem_block
+
+                    # if add mem size < op mem size, then reuse it
+                    if best_mem_add_size <= self.mem_size(op_mem_block):
+                        self.mem_block[best_mem_id] = best_mem_block
+                        mem_id = best_mem_id
                         self.idle_mem.remove(mem_id)
 
                 if mem_id == -1:
                     mem_id = self.total_mem_count
                     self.total_mem_count += 1
-                    self.mem_block[mem_id] = op_mem_size
+                    self.mem_block[mem_id] = op_mem_block
 
                 op.mem_id.extend([mem_id])
                 self.op_mem[op.output[i]] = mem_id
@@ -123,6 +145,43 @@ class MemoryOptimizer(object):
                 elif self.ref_counter[ipt] < 0:
                     raise Exception('ref count is less than 0')
 
+        self.add_net_mem_blocks()
+
+        print('total op: %d', len(self.net_def.op))
+        print('origin mem: %d, optimized mem: %d',
+              self.get_total_origin_mem_size(),
+              self.get_total_optimized_mem_size())
+
+
+class GPUMemoryOptimizer(MemoryOptimizer):
+    def op_need_optimize_memory(self, op):
+        if op.type == 'BufferToImage':
+            for arg in op.arg:
+                if arg.name == 'mode' and arg.i == 0:
+                    return False
+        return op.type != 'ImageToBuffer'
+
+    def get_op_mem_block(self, op_type, output_shape):
+        mem_block = [0, 0]
+        if op_type == 'WinogradTransform' or op_type == 'MatMul':
+            mem_block[0] = output_shape[2] * output_shape[3]
+            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+        else:
+            mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
+            mem_block[1] = output_shape[0] * output_shape[1]
+        return mem_block
+
+    def mem_size(self, memory_block):
+        return memory_block[0] * memory_block[1] * 4
+
+    def resize_mem_block(self, old_mem_block, op_mem_block):
+        resize_mem_block = [
+            max(old_mem_block[0], op_mem_block[0]),
+            max(old_mem_block[1], op_mem_block[1])
+        ]
+        return resize_mem_block
+
+    def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
@@ -130,21 +189,12 @@ class MemoryOptimizer(object):
             block.x = self.mem_block[mem][0]
             block.y = self.mem_block[mem][1]
 
-        print('total op: %d', len(self.net_def.op))
-        origin_mem_size = 0
-        optimized_mem_size = 0
-        for op in self.net_def.op:
-            if self.is_buffer_image_op(op):
-                continue
-            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
-        for mem in self.mem_block:
-            print mem, self.mem_block[mem]
-            optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
-        print('origin mem: %d, optimized mem: %d', origin_mem_size,
-              optimized_mem_size)
+def optimize_gpu_memory(net_def):
+    mem_optimizer = GPUMemoryOptimizer(net_def)
+    mem_optimizer.optimize()
 
 
-def optimize_memory(net_def):
+def optimize_cpu_memory(net_def):
     mem_optimizer = MemoryOptimizer(net_def)
     mem_optimizer.optimize()
 
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 780dfa4d5e2aa99878b55bf3b3681c8bc20fdfe6..0f0ca20b4be8f83aebca776e742dc8d65e33ffdd 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -1367,8 +1367,11 @@ def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
     print "Model Converted."
     if device == 'gpu':
         print "start optimize memory."
-        mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
-        mem_optimizer.optimize()
+        memory_optimizer.optimize_gpu_memory(net_def)
+        print "Memory optimization done."
+    elif device == 'cpu':
+        print "start optimize memory."
+        memory_optimizer.optimize_cpu_memory(net_def)
         print "Memory optimization done."
 
     return net_def
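Reviewer note (not part of the patch): the heart of the new CPU path is the reuse rule in MemoryOptimizer.optimize(). For each output tensor it scans the idle blocks, prefers the candidate that needs the least growth (add_mem_size), breaks ties by the smallest leftover space (waste_mem_size) once a zero-growth candidate exists, and only reuses a block when growing it costs no more than allocating the tensor its own block. Below is a minimal standalone sketch of that selection rule for the 1-D CPU case; the helper name pick_idle_block and its dict-based interface are illustrative assumptions, not code from this patch.

```python
import sys


def pick_idle_block(idle_blocks, needed_size):
    """Best-fit selection mirroring MemoryOptimizer.optimize() (1-D CPU case).

    idle_blocks: dict of mem_id -> current block size.
    needed_size: size required by the tensor being placed.
    Returns (mem_id, resized_size) if an idle block should be reused,
    otherwise None (the caller then allocates a fresh block of needed_size).
    """
    best_id = None
    best_add = sys.maxsize    # extra size needed to make the block fit
    best_waste = sys.maxsize  # leftover size once the tensor is placed
    for mem_id, old_size in idle_blocks.items():
        new_size = max(old_size, needed_size)   # resize_mem_block
        add = new_size - old_size               # sub_mem_block(new, old)
        waste = new_size - needed_size          # sub_mem_block(new, op)
        # minimize growth; once a zero-growth candidate exists, minimize waste
        if (best_add > 0 and add < best_add) or \
                (best_add == 0 and waste < best_waste):
            best_id, best_add, best_waste = mem_id, add, waste
    # reuse only if growing the old block costs no more than a new block
    if best_id is not None and best_add <= needed_size:
        return best_id, max(idle_blocks[best_id], needed_size)
    return None
```

The GPU variant applies the same rule to 2-D image blocks: GPUMemoryOptimizer overrides get_op_mem_block to produce [x, y] blocks, mem_size to x * y * 4, and resize_mem_block to the per-dimension maximum, while op_need_optimize_memory keeps ImageToBuffer ops (and BufferToImage ops with mode 0) out of the arena.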