提交 6ac7c5f5 编写于 作者: L liuqi

Add device_type to MemoryBlock and fix workspace preallocation bug.

上级 0d94aeae
...@@ -41,7 +41,7 @@ SerialNet::SerialNet( ...@@ -41,7 +41,7 @@ SerialNet::SerialNet(
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
// TODO(liuqi): refactor based on PB // TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device = const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type_)); operator_def, "device", static_cast<int>(device_type_));
......
...@@ -185,7 +185,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -185,7 +185,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// As DSP may have different data output type for each op, // As DSP may have different data output type for each op,
// we stick to the same concept. // we stick to the same concept.
for (auto &op : net_def.op()) { for (auto &op : net_def.op()) {
// TODO(liuqi): refactor based on PB // TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device = const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type)); op, "device", static_cast<int>(device_type));
...@@ -204,33 +204,38 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -204,33 +204,38 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// TODO(liyin): memory block should not have concept of type, but to be // TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit // consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) { for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { if (mem_block.device_type() == device_type) {
std::unique_ptr<BufferBase> tensor_buf( VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
new Buffer(GetDeviceAllocator(DeviceType::CPU))); << ", device type: " << mem_block.device_type()
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( << ", memory type: " << mem_block.mem_type();
mem_block.x() * GetEnumTypeSize(dtype) if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+ MACE_EXTRA_BUFFER_PAD_SIZE)); std::unique_ptr<BufferBase> tensor_buf(
preallocated_allocator_.SetBuffer(mem_block.mem_id(), new Buffer(GetDeviceAllocator(DeviceType::CPU)));
std::move(tensor_buf)); MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { mem_block.x() * GetEnumTypeSize(dtype)
std::unique_ptr<BufferBase> image_buf( + MACE_EXTRA_BUFFER_PAD_SIZE));
new Image()); preallocated_allocator_.SetBuffer(mem_block.mem_id(),
MACE_RETURN_IF_ERROR(image_buf->Allocate( std::move(tensor_buf));
{mem_block.x(), mem_block.y()}, dtype)); } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::unique_ptr<BufferBase> image_buf(
std::move(image_buf)); new Image());
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { MACE_RETURN_IF_ERROR(image_buf->Allocate(
std::unique_ptr<BufferBase> tensor_buf( {mem_block.x(), mem_block.y()}, dtype));
new Buffer(GetDeviceAllocator(DeviceType::GPU))); preallocated_allocator_.SetBuffer(mem_block.mem_id(),
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( std::move(image_buf));
mem_block.x() * GetEnumTypeSize(dtype))); } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::unique_ptr<BufferBase> tensor_buf(
std::move(tensor_buf)); new Buffer(GetDeviceAllocator(DeviceType::GPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
} }
} }
VLOG(3) << "Preallocate buffer to tensors"; VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) { for (auto &op : net_def.op()) {
// TODO(liuqi): refactor based on PB // TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device = const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type)); op, "device", static_cast<int>(device_type));
......
...@@ -79,9 +79,10 @@ message OperatorDef { ...@@ -79,9 +79,10 @@ message OperatorDef {
// for memory optimization // for memory optimization
message MemoryBlock { message MemoryBlock {
optional int32 mem_id = 1; optional int32 mem_id = 1;
optional MemoryType mem_type = 2; optional int32 device_type = 2;
optional uint32 x = 3; optional MemoryType mem_type = 3;
optional uint32 y = 4; optional uint32 x = 4;
optional uint32 y = 5;
} }
message MemoryArena { message MemoryArena {
repeated MemoryBlock mem_block = 1; repeated MemoryBlock mem_block = 1;
......
...@@ -17,10 +17,22 @@ import operator ...@@ -17,10 +17,22 @@ import operator
from mace.proto import mace_pb2 from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool.base_converter import DeviceType
from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import OpenCLBufferType from mace.python.tools.convert_util import OpenCLBufferType
def MemoryTypeToStr(mem_type):
    """Return a printable name for a memory-type enum value.

    Args:
        mem_type: value compared against mace_pb2.CPU_BUFFER,
            mace_pb2.GPU_BUFFER and mace_pb2.GPU_IMAGE, in that order.

    Returns:
        'CPU_BUFFER', 'GPU_BUFFER' or 'GPU_IMAGE' for the first constant
        that compares equal to mem_type, otherwise 'UNKNOWN'.
    """
    # Checked with == in declaration order, so the first match wins.
    known_types = (
        (mace_pb2.CPU_BUFFER, 'CPU_BUFFER'),
        (mace_pb2.GPU_BUFFER, 'GPU_BUFFER'),
        (mace_pb2.GPU_IMAGE, 'GPU_IMAGE'),
    )
    for enum_value, label in known_types:
        if mem_type == enum_value:
            return label
    return 'UNKNOWN'
class MemoryBlock(object): class MemoryBlock(object):
def __init__(self, mem_type, block): def __init__(self, mem_type, block):
self._mem_type = mem_type self._mem_type = mem_type
...@@ -88,6 +100,7 @@ class MemoryOptimizer(object): ...@@ -88,6 +100,7 @@ class MemoryOptimizer(object):
arena = self.net_def.mem_arena arena = self.net_def.mem_arena
block = arena.mem_block.add() block = arena.mem_block.add()
block.mem_id = mem block.mem_id = mem
block.device_type = DeviceType.CPU.value
block.mem_type = self.mem_block[mem].mem_type block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0] block.x = self.mem_block[mem].block[0]
block.y = 1 block.y = 1
...@@ -103,7 +116,8 @@ class MemoryOptimizer(object): ...@@ -103,7 +116,8 @@ class MemoryOptimizer(object):
def get_total_optimized_mem_size(self): def get_total_optimized_mem_size(self):
optimized_mem_size = 0 optimized_mem_size = 0
for mem in self.mem_block: for mem in self.mem_block:
print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block print mem, MemoryTypeToStr(self.mem_block[mem].mem_type), \
self.mem_block[mem].block
optimized_mem_size += self.mem_size(self.mem_block[mem]) optimized_mem_size += self.mem_size(self.mem_block[mem])
return optimized_mem_size return optimized_mem_size
...@@ -165,7 +179,7 @@ class MemoryOptimizer(object): ...@@ -165,7 +179,7 @@ class MemoryOptimizer(object):
self.idle_mem.remove(mem_id) self.idle_mem.remove(mem_id)
if mem_id == -1: if mem_id == -1:
mem_id = self.mem_id_base() + self.total_mem_count mem_id = self.total_mem_count
self.total_mem_count += 1 self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_block self.mem_block[mem_id] = op_mem_block
...@@ -198,9 +212,6 @@ class MemoryOptimizer(object): ...@@ -198,9 +212,6 @@ class MemoryOptimizer(object):
self.get_total_origin_mem_size(), self.get_total_origin_mem_size(),
self.get_total_optimized_mem_size())) self.get_total_optimized_mem_size()))
def mem_id_base(self):
return 0
class GPUMemoryOptimizer(MemoryOptimizer): class GPUMemoryOptimizer(MemoryOptimizer):
def op_need_optimize_memory(self, op): def op_need_optimize_memory(self, op):
...@@ -256,6 +267,7 @@ class GPUMemoryOptimizer(MemoryOptimizer): ...@@ -256,6 +267,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
arena = self.net_def.mem_arena arena = self.net_def.mem_arena
block = arena.mem_block.add() block = arena.mem_block.add()
block.mem_id = mem block.mem_id = mem
block.device_type = DeviceType.GPU.value
block.mem_type = self.mem_block[mem].mem_type block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0] block.x = self.mem_block[mem].block[0]
block.y = self.mem_block[mem].block[1] block.y = self.mem_block[mem].block[1]
...@@ -279,9 +291,6 @@ class GPUMemoryOptimizer(MemoryOptimizer): ...@@ -279,9 +291,6 @@ class GPUMemoryOptimizer(MemoryOptimizer):
net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
max_image_size_y] max_image_size_y]
def mem_id_base(self):
return 20000
def optimize_gpu_memory(net_def): def optimize_gpu_memory(net_def):
mem_optimizer = GPUMemoryOptimizer(net_def) mem_optimizer = GPUMemoryOptimizer(net_def)
......
...@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) { ...@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block(); mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}}); mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
mem_block{{i}}->set_device_type(static_cast<DeviceType>({{net.mem_arena.mem_block[i].device_type}}));
mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}})); mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}}); mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}}); mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
......
...@@ -224,8 +224,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -224,8 +224,7 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes, const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) { NetDef *net_def) {
std::map<std::string, int> res; std::map<std::string, int> res;
// TODO(liuqi) refactor based on PB int mem_id = 0;
int mem_id = 20000;
size_t input_shape_size = input_shapes.size(); size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0; uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0; uint32_t in_mem_block_y = 0;
...@@ -244,6 +243,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -244,6 +243,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) { for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id); mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y); mem_blk_ptr->set_y(in_mem_block_y);
...@@ -264,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -264,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) { for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id); mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y); mem_blk_ptr->set_y(out_mem_block_y);
......
...@@ -225,8 +225,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -225,8 +225,7 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes, const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) { NetDef *net_def) {
std::map<std::string, int> res; std::map<std::string, int> res;
// TODO(liuqi) refactor based on PB int mem_id = 0;
int mem_id = 20000;
size_t input_shape_size = input_shapes.size(); size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0; uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0; uint32_t in_mem_block_y = 0;
...@@ -245,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -245,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) { for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id); mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y); mem_blk_ptr->set_y(in_mem_block_y);
...@@ -265,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization( ...@@ -265,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) { for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id); mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y); mem_blk_ptr->set_y(out_mem_block_y);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册