Commit 440fd4c7 authored by: 李寅

Merge branch 'fix-ws-bug' into 'master'

Add device_type to MemoryBlock and fix workspace preallocation bug.

See merge request !694
......@@ -41,7 +41,7 @@ SerialNet::SerialNet(
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// TODO(liuqi): refactor based on PB
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type_));
......
......@@ -185,7 +185,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor based on PB
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
......@@ -204,33 +204,38 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image());
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type()
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image());
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor based on PB
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
......
......@@ -79,9 +79,10 @@ message OperatorDef {
// for memory optimization
// One planned allocation produced by the memory optimizer.
// NOTE(review): the scraped diff showed the old and new field lists
// concatenated (mem_type at both 2 and 3); this is the post-merge
// schema with device_type inserted as field 2.
message MemoryBlock {
  optional int32 mem_id = 1;
  // Device (CPU/GPU/...) the block belongs to, so each device's
  // workspace preallocates only its own blocks.
  optional int32 device_type = 2;
  optional MemoryType mem_type = 3;
  // Block dimensions: x is size (buffers) or width (images); y is the
  // image height (unused for 1-D buffers per the CPU optimizer's y=1).
  optional uint32 x = 4;
  optional uint32 y = 5;
}
message MemoryArena {
repeated MemoryBlock mem_block = 1;
......
......@@ -17,10 +17,22 @@ import operator
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool.base_converter import DeviceType
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import OpenCLBufferType
def MemoryTypeToStr(mem_type):
    """Map a mace_pb2 MemoryType value to its display name.

    Returns 'UNKNOWN' for any value outside the three known types.
    """
    names = {
        mace_pb2.CPU_BUFFER: 'CPU_BUFFER',
        mace_pb2.GPU_BUFFER: 'GPU_BUFFER',
        mace_pb2.GPU_IMAGE: 'GPU_IMAGE',
    }
    return names.get(mem_type, 'UNKNOWN')
class MemoryBlock(object):
def __init__(self, mem_type, block):
self._mem_type = mem_type
......@@ -88,6 +100,7 @@ class MemoryOptimizer(object):
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.device_type = DeviceType.CPU.value
block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0]
block.y = 1
......@@ -103,7 +116,8 @@ class MemoryOptimizer(object):
def get_total_optimized_mem_size(self):
optimized_mem_size = 0
for mem in self.mem_block:
print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
print mem, MemoryTypeToStr(self.mem_block[mem].mem_type), \
self.mem_block[mem].block
optimized_mem_size += self.mem_size(self.mem_block[mem])
return optimized_mem_size
......@@ -165,7 +179,7 @@ class MemoryOptimizer(object):
self.idle_mem.remove(mem_id)
if mem_id == -1:
mem_id = self.mem_id_base() + self.total_mem_count
mem_id = self.total_mem_count
self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_block
......@@ -198,9 +212,6 @@ class MemoryOptimizer(object):
self.get_total_origin_mem_size(),
self.get_total_optimized_mem_size()))
def mem_id_base(self):
    # Base offset added to newly allocated memory ids (see the
    # `mem_id = self.mem_id_base() + self.total_mem_count` call site);
    # the CPU optimizer starts ids at 0.
    return 0
class GPUMemoryOptimizer(MemoryOptimizer):
def op_need_optimize_memory(self, op):
......@@ -256,6 +267,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.device_type = DeviceType.GPU.value
block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0]
block.y = self.mem_block[mem].block[1]
......@@ -279,9 +291,6 @@ class GPUMemoryOptimizer(MemoryOptimizer):
net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
max_image_size_y]
def mem_id_base(self):
    # GPU ids start at 20000 — presumably to keep them disjoint from
    # CPU ids (base 0); confirm against the runtime's id handling.
    return 20000
def optimize_gpu_memory(net_def):
mem_optimizer = GPUMemoryOptimizer(net_def)
......
......@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
mem_block{{i}}->set_device_type(static_cast<DeviceType>({{net.mem_arena.mem_block[i].device_type}}));
mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
......
......@@ -224,8 +224,7 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) {
std::map<std::string, int> res;
// TODO(liuqi) refactor based on PB
int mem_id = 20000;
int mem_id = 0;
size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0;
......@@ -244,6 +243,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y);
......@@ -264,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y);
......
......@@ -225,8 +225,7 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) {
std::map<std::string, int> res;
// TODO(liuqi) refactor based on PB
int mem_id = 20000;
int mem_id = 0;
size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0;
......@@ -245,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y);
......@@ -265,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_device_type(DeviceType::GPU);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.