diff --git a/mace/core/net.cc b/mace/core/net.cc
index 6d8a751d16501ea678f6f6f71b700fab053a2687..259a9423bcfabbca3a52a0b3514551f45a14806c 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -41,7 +41,7 @@ SerialNet::SerialNet(
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
-    // TODO(liuqi): refactor based on PB
+    // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device =
         ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
             operator_def, "device", static_cast<int>(device_type_));
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 1e8826cd441a7a886f1f093fbef1504fa6f98ed5..fd083504bc496e6c8721244cffdaddf2dd75fb60 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -185,7 +185,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // As DSP may have different data output type for each op,
   // we stick to the same concept.
   for (auto &op : net_def.op()) {
-    // TODO(liuqi): refactor based on PB
+    // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device =
         ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
             op, "device", static_cast<int>(device_type));
@@ -204,33 +204,38 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
-      std::unique_ptr<BufferBase> tensor_buf(
-          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
-      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-          mem_block.x() * GetEnumTypeSize(dtype)
-          + MACE_EXTRA_BUFFER_PAD_SIZE));
-      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                        std::move(tensor_buf));
-    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
-      std::unique_ptr<BufferBase> image_buf(
-          new Image());
-      MACE_RETURN_IF_ERROR(image_buf->Allocate(
-          {mem_block.x(), mem_block.y()}, dtype));
-      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                        std::move(image_buf));
-    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
-      std::unique_ptr<BufferBase> tensor_buf(
-          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
-      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-          mem_block.x() * GetEnumTypeSize(dtype)));
-      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                        std::move(tensor_buf));
+    if (mem_block.device_type() == device_type) {
+      VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
+              << ", device type: " << mem_block.device_type()
+              << ", memory type: " << mem_block.mem_type();
+      if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+        std::unique_ptr<BufferBase> tensor_buf(
+            new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+            mem_block.x() * GetEnumTypeSize(dtype)
+            + MACE_EXTRA_BUFFER_PAD_SIZE));
+        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                          std::move(tensor_buf));
+      } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+        std::unique_ptr<BufferBase> image_buf(
+            new Image());
+        MACE_RETURN_IF_ERROR(image_buf->Allocate(
+            {mem_block.x(), mem_block.y()}, dtype));
+        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                          std::move(image_buf));
+      } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+        std::unique_ptr<BufferBase> tensor_buf(
+            new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+            mem_block.x() * GetEnumTypeSize(dtype)));
+        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                          std::move(tensor_buf));
+      }
     }
   }
   VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
-    // TODO(liuqi): refactor based on PB
+    // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device =
         ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
             op, "device", static_cast<int>(device_type));
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index 08d862ea171cf1a31bda86d2c53f798e5f7e5ede..63115a86a6602929bdaacf5b02d0c5c6080d7213 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -79,9 +79,10 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional MemoryType mem_type = 2;
-  optional uint32 x = 3;
-  optional uint32 y = 4;
+  optional int32 device_type = 2;
+  optional MemoryType mem_type = 3;
+  optional uint32 x = 4;
+  optional uint32 y = 5;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py
index 0c18a66bc76bf80421d24e74c7f3b8961ff5ddd6..36ee96074121ec009b3bd9032f6d15fffd5c5a5d 100644
--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -17,10 +17,22 @@ import operator
 
 from mace.proto import mace_pb2
 from mace.python.tools.converter_tool import base_converter as cvt
+from mace.python.tools.converter_tool.base_converter import DeviceType
 from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import OpenCLBufferType
 
 
+def MemoryTypeToStr(mem_type):
+    if mem_type == mace_pb2.CPU_BUFFER:
+        return 'CPU_BUFFER'
+    elif mem_type == mace_pb2.GPU_BUFFER:
+        return 'GPU_BUFFER'
+    elif mem_type == mace_pb2.GPU_IMAGE:
+        return 'GPU_IMAGE'
+    else:
+        return 'UNKNOWN'
+
+
 class MemoryBlock(object):
     def __init__(self, mem_type, block):
         self._mem_type = mem_type
@@ -88,6 +100,7 @@ class MemoryOptimizer(object):
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
+            block.device_type = DeviceType.CPU.value
             block.mem_type = self.mem_block[mem].mem_type
             block.x = self.mem_block[mem].block[0]
             block.y = 1
@@ -103,7 +116,8 @@ class MemoryOptimizer(object):
     def get_total_optimized_mem_size(self):
         optimized_mem_size = 0
         for mem in self.mem_block:
-            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
+            print mem, MemoryTypeToStr(self.mem_block[mem].mem_type), \
+                self.mem_block[mem].block
             optimized_mem_size += self.mem_size(self.mem_block[mem])
 
         return optimized_mem_size
@@ -165,7 +179,7 @@ class MemoryOptimizer(object):
                     self.idle_mem.remove(mem_id)
 
             if mem_id == -1:
-                mem_id = self.mem_id_base() + self.total_mem_count
+                mem_id = self.total_mem_count
                 self.total_mem_count += 1
             self.mem_block[mem_id] = op_mem_block
 
@@ -198,9 +212,6 @@ class MemoryOptimizer(object):
                 self.get_total_origin_mem_size(),
                 self.get_total_optimized_mem_size()))
 
-    def mem_id_base(self):
-        return 0
-
 
 class GPUMemoryOptimizer(MemoryOptimizer):
     def op_need_optimize_memory(self, op):
@@ -256,6 +267,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
+            block.device_type = DeviceType.GPU.value
             block.mem_type = self.mem_block[mem].mem_type
             block.x = self.mem_block[mem].block[0]
             block.y = self.mem_block[mem].block[1]
@@ -279,9 +291,6 @@ class GPUMemoryOptimizer(MemoryOptimizer):
         net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
                                             max_image_size_y]
 
-    def mem_id_base(self):
-        return 20000
-
 
 def optimize_gpu_memory(net_def):
     mem_optimizer = GPUMemoryOptimizer(net_def)
diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2
index 267911b9ee6c8ea0a68cfb60e57b60f38d36937e..efb1c359f9ee8fd80d994da4b7c625787c69d14a 100644
--- a/mace/python/tools/model.jinja2
+++ b/mace/python/tools/model.jinja2
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
 
     mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
     mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
+    mem_block{{i}}->set_device_type(static_cast<DeviceType>({{net.mem_arena.mem_block[i].device_type}}));
     mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
     mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
     mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index 2280ae35abf8b1b0296daa5bff66c33803c4f98c..27c601fe8410d57adef4a0179d70f14e8d8ade4e 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -224,8 +224,7 @@ std::map<std::string, int> AddMemoryOptimization(
     const std::vector<std::vector<int64_t>> &output_shapes,
     NetDef *net_def) {
   std::map<std::string, int> res;
-  // TODO(liuqi) refactor based on PB
-  int mem_id = 20000;
+  int mem_id = 0;
   size_t input_shape_size = input_shapes.size();
   uint32_t in_mem_block_x = 0;
   uint32_t in_mem_block_y = 0;
@@ -244,6 +243,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_device_type(DeviceType::GPU);
     mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
@@ -264,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_device_type(DeviceType::GPU);
     mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 9f929de6f8ec562eb8cbf09b52b2372ceefa2879..46bd9fe1f9306325f3b82a35ab877c89e7af7162 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -225,8 +225,7 @@ std::map<std::string, int> AddMemoryOptimization(
     const std::vector<std::vector<int64_t>> &output_shapes,
     NetDef *net_def) {
   std::map<std::string, int> res;
-  // TODO(liuqi) refactor based on PB
-  int mem_id = 20000;
+  int mem_id = 0;
   size_t input_shape_size = input_shapes.size();
   uint32_t in_mem_block_x = 0;
   uint32_t in_mem_block_y = 0;
@@ -245,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_device_type(DeviceType::GPU);
     mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
@@ -265,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_device_type(DeviceType::GPU);
     mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);