提交 d24d459e 编写于 作者: L liuqi

Merge memory_optimizer code into tf_converter_lib.

上级 66a68689
import operator
import sys
from functools import reduce

from mace.proto import mace_pb2
class MemoryOptimizer(object):
    """Greedy in-place memory planner for a MACE NetDef.

    Assigns every (non buffer/image-transform) op's output tensor to a
    reusable OpenCL-image memory block, returning a block to the idle pool
    as soon as every consumer of the producing tensor has been visited.

    Fixes vs. the original:
      * optimize() used the bare global name ``net_def`` instead of
        ``self.net_def`` (NameError when used outside the __main__ script).
      * The image width was computed as ``(W * (C + 3)) / 4`` due to operator
        precedence; the RGBA packing (4 channels per image pixel) requires
        ``W * ceil(C / 4)``, i.e. ``W * ((C + 3) // 4)``.  Integer (floor)
        division also keeps the result an int under Python 3.
      * ``print('fmt %d', x)`` printed a tuple; now uses ``%`` formatting.
    """

    def __init__(self, net_def):
        """Build consumer reference counts for every op output tensor.

        Args:
            net_def: a mace_pb2.NetDef whose ops carry ``output_shape``.
        """
        self.net_def = net_def
        self.idle_mem = set()    # mem ids currently free for reuse
        self.op_mem = {}         # tensor name -> mem_id
        self.mem_block = {}      # mem_id -> [x, y] image block size
        self.total_mem_count = 0
        self.ref_counter = {}    # tensor name -> remaining consumer count

        # Map each input tensor name to the ops consuming it, so we know
        # when a producing op's memory block can be released.
        consumers = {}
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            for ipt in op.input:
                if ipt not in consumers:
                    consumers[ipt] = []
                consumers[ipt].append(op)
        # Only reference-count each op's own output tensor.
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            tensor_name = self._op_to_tensor(op)
            if tensor_name in consumers:
                self.ref_counter[tensor_name] = len(consumers[tensor_name])
            else:
                self.ref_counter[tensor_name] = 0

    def _op_to_tensor(self, op):
        # MACE convention: an op's (single) output tensor is '<op name>:0'.
        return op.name + ':0'

    def is_buffer_image_op(self, op):
        # Buffer<->image transform ops do not own reusable output memory.
        return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'

    def optimize(self):
        """Assign mem ids to ops and record block sizes in net_def.mem_arena."""
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            if len(self.idle_mem) == 0:
                # No free block: allocate a new one.
                mem_id = self.total_mem_count
                self.total_mem_count += 1
            else:
                # Reuse a released block.
                mem_id = self.idle_mem.pop()
            op.mem_id = mem_id
            self.op_mem[self._op_to_tensor(op)] = mem_id
            if mem_id not in self.mem_block:
                self.mem_block[mem_id] = [0, 0]
            # Grow the block to cover this output.  Output shape is NHWC:
            # y = N * H, x = W * ceil(C / 4) (4 channels per image pixel).
            mem_size = self.mem_block[mem_id]
            mem_size[1] = max(mem_size[1],
                              op.output_shape[0].dims[0] *
                              op.output_shape[0].dims[1])
            mem_size[0] = max(mem_size[0],
                              op.output_shape[0].dims[2] *
                              ((op.output_shape[0].dims[3] + 3) // 4))
            # De-reference input tensors; free their blocks when unused.
            for ipt in op.input:
                if ipt in self.ref_counter:
                    self.ref_counter[ipt] -= 1
                    if self.ref_counter[ipt] == 0:
                        self.idle_mem.add(self.op_mem[ipt])
                    elif self.ref_counter[ipt] < 0:
                        raise Exception('ref count is less than 0')

        # Serialize the planned blocks into the net's memory arena.
        # (Was: bare global ``net_def`` -- must be self.net_def.)
        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.x = self.mem_block[mem][0]
            block.y = self.mem_block[mem][1]

        print('total op: %d' % len(self.net_def.op))
        origin_mem_size = 0
        optimized_mem_size = 0
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
        for mem in self.mem_block:
            # Each image pixel holds 4 values, hence the initializer 4.
            optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
        print('origin mem: %d, optimized mem: %d'
              % (origin_mem_size, optimized_mem_size))
# CLI entry: memory_optimizer <input_model.pb> <output_model.pb>
# Reads a serialized NetDef, plans memory in place, and writes the
# optimized model plus a human-readable text dump (weights stripped).
if __name__ == '__main__':
    model_file = sys.argv[1]       # path to the input MACE model
    opt_model_file = sys.argv[2]   # path for the optimized output model
    with open(model_file, "rb") as f:
        net_def = mace_pb2.NetDef()
        net_def.ParseFromString(f.read())
        # Plan memory in place on net_def (adds mem_id / mem_arena fields).
        optimizer = MemoryOptimizer(net_def)
        optimizer.optimize()
        with open(opt_model_file, "wb") as f:
            f.write(net_def.SerializeToString())
        # NOTE(review): writing str(...) to a "wb" handle only works on
        # Python 2; Python 3 would need mode "w" or an explicit encode.
        with open(opt_model_file + '_txt', "wb") as f:
            # Drop tensor data so the text dump stays small.
            net_def.ClearField('tensors')
            f.write(str(net_def))
import operator
import sys
from mace.proto import mace_pb2 from mace.proto import mace_pb2
import tensorflow as tf import tensorflow as tf
import numpy as np import numpy as np
...@@ -44,13 +46,18 @@ class TFConverter(object): ...@@ -44,13 +46,18 @@ class TFConverter(object):
self.tf_graph = {} self.tf_graph = {}
self.resolved_ops = {} self.resolved_ops = {}
self.idle_mem = set()
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0
self.ref_counter = {}
for op in tf_ops: for op in tf_ops:
self.resolved_ops[op.name] = 0 self.resolved_ops[op.name] = 0
for input in op.inputs: for input in op.inputs:
input_name = input.name[:-2] input_name = input.name[:-2]
if input_name not in self.tf_graph: if input_name not in self.tf_graph:
self.tf_graph[input_name] = [] self.tf_graph[input_name] = []
print input_name
self.tf_graph[input_name].append(op) self.tf_graph[input_name].append(op)
def add_buffer_to_image(self, input_name, input_type): def add_buffer_to_image(self, input_name, input_type):
...@@ -104,7 +111,7 @@ class TFConverter(object): ...@@ -104,7 +111,7 @@ class TFConverter(object):
def add_output_shape(outputs, op): def add_output_shape(outputs, op):
output_shapes = [] output_shapes = []
for output in outputs: for output in outputs:
if output.shape is not None and not output.shape: if output.shape.num_elements() is not None:
output_shape = mace_pb2.OutputShape() output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list()) output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape) output_shapes.append(output_shape)
...@@ -209,12 +216,21 @@ class TFConverter(object): ...@@ -209,12 +216,21 @@ class TFConverter(object):
def convert_batchnorm(self, op): def convert_batchnorm(self, op):
bn_ops = [] bn_ops = []
bn_ops.append(op) bn_ops.append(op)
for i in range(1, 7): for i in range(1, 3):
if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ if len(self.tf_graph[bn_ops[i-1].name]) == 1 \
and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) bn_ops.append(self.tf_graph[bn_ops[i-1].name][0])
else: else:
raise Exception('Invalid BatchNorm Op') raise Exception('Invalid BatchNorm Op')
if len(self.tf_graph[bn_ops[2].name]) == 2 \
and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \
and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
bn_ops.append(self.tf_graph[bn_ops[2].name][0])
bn_ops.append(self.tf_graph[bn_ops[2].name][1])
else:
raise Exception('Invalid BatchNorm Op')
bn_ops.append(self.tf_graph[bn_ops[4].name][0])
bn_ops.append(self.tf_graph[bn_ops[3].name][0])
op_def = mace_pb2.OperatorDef() op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add() arg = op_def.arg.add()
...@@ -246,7 +262,7 @@ class TFConverter(object): ...@@ -246,7 +262,7 @@ class TFConverter(object):
data_format_arg.s = 'NHWC' data_format_arg.s = 'NHWC'
self.net_def.op.extend([op_def]) self.net_def.op.extend([op_def])
for i in range(1, 7): for i in range(0, 7):
self.resolved_ops[bn_ops[i].name] = 1 self.resolved_ops[bn_ops[i].name] = 1
def convert_pooling(self, op): def convert_pooling(self, op):
...@@ -408,6 +424,83 @@ class TFConverter(object): ...@@ -408,6 +424,83 @@ class TFConverter(object):
if self.resolved_ops[key] != 1: if self.resolved_ops[key] != 1:
print 'Unresolve Op: %s' % key print 'Unresolve Op: %s' % key
@staticmethod
def _op_to_tensor(op):
    """Name of *op*'s single output tensor (MACE '<name>:0' convention)."""
    return '%s:0' % op.name
@staticmethod
def is_buffer_image_op(op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def optimize(self):
    """Greedily plan reusable memory blocks for every op output.

    Counts the consumers of each tensor, then walks the ops in order,
    assigning each output a memory block (reusing a released one when
    possible) and releasing a block once its tensor's last consumer has
    been visited.  Results are written into ``self.net_def.mem_arena``.

    Fixes vs. the original: ``print('fmt %d', x)`` printed a tuple (now
    ``%``-formatted); image width used ``(W * (C + 3)) / 4`` instead of
    ``W * ceil(C / 4)`` for RGBA packing; ``reduce`` is imported locally
    so the method also runs on Python 3.
    """
    from functools import reduce  # not a builtin on Python 3

    # Map each input tensor name to the ops consuming it.
    consumers = {}
    for op in self.net_def.op:
        if self.is_buffer_image_op(op):
            continue
        for ipt in op.input:
            if ipt not in consumers:
                consumers[ipt] = []
            consumers[ipt].append(op)
    # Only reference-count each op's own output tensor.
    for op in self.net_def.op:
        if self.is_buffer_image_op(op):
            continue
        tensor_name = self._op_to_tensor(op)
        if tensor_name in consumers:
            self.ref_counter[tensor_name] = len(consumers[tensor_name])
        else:
            self.ref_counter[tensor_name] = 0

    for op in self.net_def.op:
        if self.is_buffer_image_op(op):
            continue
        if not op.output_shape:
            # Without shape information we cannot size blocks; bail out.
            print("Op %s don't have output shape information, "
                  "No way to optimize memory." % op.name)
            return
        if len(self.idle_mem) == 0:
            # No free block: allocate a new one.
            mem_id = self.total_mem_count
            self.total_mem_count += 1
        else:
            # Reuse a released block.
            mem_id = self.idle_mem.pop()
        op.mem_id = mem_id
        self.op_mem[self._op_to_tensor(op)] = mem_id
        if mem_id not in self.mem_block:
            self.mem_block[mem_id] = [0, 0]
        # Output shape is NHWC: y = N * H, x = W * ceil(C / 4)
        # (4 channels packed per OpenCL image pixel).
        mem_size = self.mem_block[mem_id]
        mem_size[1] = max(mem_size[1],
                          op.output_shape[0].dims[0] *
                          op.output_shape[0].dims[1])
        mem_size[0] = max(mem_size[0],
                          op.output_shape[0].dims[2] *
                          ((op.output_shape[0].dims[3] + 3) // 4))
        # De-reference input tensors; free their blocks when unused.
        for ipt in op.input:
            if ipt in self.ref_counter:
                self.ref_counter[ipt] -= 1
                if self.ref_counter[ipt] == 0:
                    self.idle_mem.add(self.op_mem[ipt])
                elif self.ref_counter[ipt] < 0:
                    raise Exception('ref count is less than 0')

    # Serialize the planned blocks into the net's memory arena.
    for mem in self.mem_block:
        arena = self.net_def.mem_arena
        block = arena.mem_block.add()
        block.mem_id = mem
        block.x = self.mem_block[mem][0]
        block.y = self.mem_block[mem][1]

    print('total op: %d' % len(self.net_def.op))
    origin_mem_size = 0
    optimized_mem_size = 0
    for op in self.net_def.op:
        if self.is_buffer_image_op(op):
            continue
        origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
    for mem in self.mem_block:
        # Each image pixel holds 4 values, hence the initializer 4.
        optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
    print('origin mem: %d, optimized mem: %d'
          % (origin_mem_size, optimized_mem_size))
def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device): def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device):
net_def = mace_pb2.NetDef() net_def = mace_pb2.NetDef()
dt = data_type_map[data_type] dt = data_type_map[data_type]
...@@ -418,7 +511,8 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, devi ...@@ -418,7 +511,8 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, devi
ops = graph.get_operations() ops = graph.get_operations()
converter = TFConverter(ops, net_def, dt, device) converter = TFConverter(ops, net_def, dt, device)
converter.convert(input_node, output_node) converter.convert(input_node, output_node)
print "PB Converted, start optimize memory."
print "PB Parsed." converter.optimize()
print "Memory optimization done."
return net_def return net_def
...@@ -13,7 +13,6 @@ fi ...@@ -13,7 +13,6 @@ fi
TF_MODEL_FILE_PATH=$1 TF_MODEL_FILE_PATH=$1
MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
MACE_MODEL_NAME='mace_model.pb' MACE_MODEL_NAME='mace_model.pb'
MACE_OPT_MODEL_NAME='mace_opt_model.pb'
INPUT_FILE_NAME='model_input' INPUT_FILE_NAME='model_input'
OUTPUT_FILE_NAME='gcn.out' OUTPUT_FILE_NAME='gcn.out'
OUTPUT_LIST_FILE='gcn.list' OUTPUT_LIST_FILE='gcn.list'
...@@ -36,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ ...@@ -36,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output_node=GCN/br_result_2/fcn_br \ --output_node=GCN/br_result_2/fcn_br \
--data_type=DT_HALF \ --data_type=DT_HALF \
--runtime=gpu --runtime=gpu
bazel build mace/python/tools:memory_optimizer
bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
${MODEL_DIR}/${MACE_OPT_MODEL_NAME}
# Step 3: Run model on the phone # Step 3: Run model on the phone
echo "Step 3: Run model on the phone" echo "Step 3: Run model on the phone"
...@@ -51,7 +46,7 @@ bazel build -c opt --strip always mace/examples:mace_run \ ...@@ -51,7 +46,7 @@ bazel build -c opt --strip always mace/examples:mace_run \
adb shell "mkdir -p ${PHONE_DATA_DIR}" adb shell "mkdir -p ${PHONE_DATA_DIR}"
adb shell "mkdir -p ${KERNEL_DIR}" adb shell "mkdir -p ${KERNEL_DIR}"
adb push mace/kernels/opencl/cl/* ${KERNEL_DIR} adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR} adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
...@@ -62,7 +57,7 @@ adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \ ...@@ -62,7 +57,7 @@ adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
MACE_KERNEL_PATH=$KERNEL_DIR \ MACE_KERNEL_PATH=$KERNEL_DIR \
OMP_NUM_THREADS=$num_threads \ OMP_NUM_THREADS=$num_threads \
${PHONE_DATA_DIR}/mace_run \ ${PHONE_DATA_DIR}/mace_run \
--model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \ --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
--input=mace_input_node \ --input=mace_input_node \
--output=mace_output_node \ --output=mace_output_node \
--input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\ --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册