Commit 1d924255 authored by yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-learning/mace into gen_opencl_kernel_binary

@@ -91,8 +91,13 @@ class Operator : public OperatorBase {
   }
   for (const string &output_str : operator_def.output()) {
-    outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
-        output_str, GetDeviceAllocator(D), DataTypeToEnum<T>::v())));
+    if (ws->HasTensor(output_str)) {
+      Tensor *found_tensor = ws->GetTensor(output_str);
+      outputs_.push_back(found_tensor);
+    } else {
+      outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
+          output_str, GetDeviceAllocator(D), DataTypeToEnum<T>::v())));
+    }
   }
 }
 virtual bool Run() override = 0;
......
@@ -199,14 +199,20 @@ class Tensor {
     size_ = size;
     MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
-    if (is_image_) {
-      alloc_->DeleteImage(buffer_);
-    } else {
+    if (is_image_ && !image_shape_.empty()) {
+      MACE_ASSERT(image_shape_.size() == 2
+                      && image_shape_[0] >= image_shape[0]
+                      && image_shape_[1] >= image_shape[1],
+                  "image shape not large enough");
+    }
+    if (!is_image_ && buffer_ != nullptr) {
       alloc_->Delete(buffer_);
     }
     is_image_ = true;
-    image_shape_ = image_shape;
-    buffer_ = alloc_->NewImage(image_shape, dtype_);
+    if (image_shape_.empty()) {
+      image_shape_ = image_shape;
+      buffer_ = alloc_->NewImage(image_shape, dtype_);
+    }
   }
 }
@@ -226,6 +232,17 @@ class Tensor {
   }
 }

+inline void AllocateImageMemory(const std::vector<size_t> &image_shape) {
+  is_image_ = true;
+  if (image_shape_ != image_shape) {
+    if (buffer_ != nullptr) {
+      alloc_->DeleteImage(buffer_);
+    }
+    image_shape_ = image_shape;
+    buffer_ = alloc_->NewImage(image_shape, dtype_);
+  }
+}
+
 template <typename T>
 inline void Copy(const T *src, index_t size) {
   MACE_CHECK(size == size_, "copy src and dst with different size.");
......
@@ -3,8 +3,8 @@
 //
 #include "mace/core/workspace.h"

 #include "mace/core/common.h"
 #include "mace/core/serializer.h"
+#include "mace/core/proto_utils.h"
namespace mace {
@@ -63,6 +63,34 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
     tensor_map_[tensor_proto.name()] =
         serializer.Deserialize(tensor_proto, type);
   }
+
+  if (type == DeviceType::OPENCL) {
+    CreateImageOutputTensor(net_def);
+  }
 }
+
+void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
+  if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
+    return;
+  }
+  std::map<std::string, std::shared_ptr<Tensor>> mem_tensor_map;
+  const DataType dtype = static_cast<DataType>(
+      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
+          net_def.op(0), "T", static_cast<int>(DT_FLOAT)));
+  for (auto &mem_block : net_def.mem_arena().mem_block()) {
+    string mem_block_name = MemBlockName(mem_block.mem_id());
+    mem_tensor_map[mem_block_name].reset(
+        new Tensor(GetDeviceAllocator(DeviceType::OPENCL), dtype));
+    mem_tensor_map[mem_block_name]->AllocateImageMemory(
+        {mem_block.x(), mem_block.y()});
+  }
+  for (auto &op : net_def.op()) {
+    if (op.has_mem_id()) {
+      tensor_map_[op.output(0)] = mem_tensor_map[MemBlockName(op.mem_id())];
+    }
+  }
+}

 }  // namespace mace
\ No newline at end of file
@@ -13,7 +13,7 @@ namespace mace {
 class Workspace {
  public:
-  typedef map<string, unique_ptr<Tensor>> TensorMap;
+  typedef map<string, std::shared_ptr<Tensor>> TensorMap;

   Workspace() {}
@@ -33,7 +33,13 @@ class Workspace {
   void LoadModelTensor(const NetDef &net_def, DeviceType type);

+  inline std::string MemBlockName(int mem_id) const {
+    return internal::MakeString("mem_block_", mem_id);
+  }
+
  private:
+  void CreateImageOutputTensor(const NetDef &net_def);
+
   TensorMap tensor_map_;

   DISABLE_COPY_AND_ASSIGN(Workspace);
......
@@ -101,9 +101,12 @@ int main(int argc, char **argv) {
   }

   // Init model
+  VLOG(0) << "Run init";
   auto net = CreateNet(net_def, &ws, device_type, NetMode::INIT);
   net->Run();

+  VLOG(0) << "Run model";
   // run model
   net = CreateNet(net_def, &ws, device_type);
......
@@ -83,6 +83,7 @@ message OperatorDef {
   optional string type = 4;
   repeated Argument arg = 5;
   repeated OutputShape output_shape = 6;
+  repeated DataType output_type = 7;

   // Memory optimization: only support one single output op
   optional int32 mem_id = 10 [default = -1];
@@ -128,6 +129,9 @@ message NetDef {
   repeated Argument arg = 4;
   repeated TensorProto tensors = 5;

+  // for mem optimization
+  optional MemoryArena mem_arena = 10;
+
   // for hexagon mace-nnlib
   repeated InputInfo input_info = 100;
   repeated OutputInfo output_info = 101;
......
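The two new proto fields work as a pair: the optimizer writes a single MemoryArena into the NetDef, and each eligible op points at one of its blocks through mem_id. A minimal inspection sketch (the helper name print_mem_plan is ours, for illustration only):

from mace.proto import mace_pb2

def print_mem_plan(net_def):
  # One mem_block per physical OpenCL image; ops share a block via mem_id.
  for block in net_def.mem_arena.mem_block:
    print('block %d: x=%d, y=%d' % (block.mem_id, block.x, block.y))
  for op in net_def.op:
    if op.mem_id >= 0:  # default is -1, i.e. no shared block
      print('%s -> block %d' % (op.name, op.mem_id))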
 py_library(
     name = "tf_converter_lib",
     srcs = [
+        "convert_util.py",
+        "graph_util.py",
         "tf_converter_lib.py",
         "tf_dsp_converter_lib.py",
-        "graph_util.py"],
+    ],
     srcs_version = "PY2AND3",
     deps = [
         "//mace/proto:mace_py",
@@ -20,6 +22,15 @@ py_binary(
     ],
 )

+py_binary(
+    name = "memory_optimizer",
+    srcs = ["memory_optimizer.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//mace/proto:mace_py",
+    ],
+)
+
 py_binary(
     name = "tf_ops_stats",
     srcs = ["tf_ops_stats.py"],
......
import tensorflow as tf
from mace.proto import mace_pb2

TF_DTYPE_2_MACE_DTYPE_MAP = {
  tf.float32: mace_pb2.DT_FLOAT,
  tf.double: mace_pb2.DT_DOUBLE,
  tf.half: mace_pb2.DT_HALF,
  tf.int64: mace_pb2.DT_INT64,
  tf.int32: mace_pb2.DT_INT32,
  tf.qint32: mace_pb2.DT_INT32,
  tf.int16: mace_pb2.DT_INT16,
  tf.qint16: mace_pb2.DT_INT16,
  tf.int8: mace_pb2.DT_INT8,
  tf.qint8: mace_pb2.DT_INT8,
  tf.quint16: mace_pb2.DT_UINT16,
  tf.uint16: mace_pb2.DT_UINT16,
  tf.quint8: mace_pb2.DT_UINT8,
  tf.uint8: mace_pb2.DT_UINT8,
  tf.string: mace_pb2.DT_STRING,
  tf.bool: mace_pb2.DT_BOOL,
}


def tf_dtype_2_mace_dtype(tf_dtype):
  mace_dtype = TF_DTYPE_2_MACE_DTYPE_MAP.get(tf_dtype, None)
  if not mace_dtype:
    raise Exception("Not supported tensorflow dtype: " + str(tf_dtype))
  return mace_dtype
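A quick sanity check of the mapping; note that quantized TensorFlow dtypes deliberately collapse onto their plain integer counterparts:

import tensorflow as tf
from mace.proto import mace_pb2
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype

assert tf_dtype_2_mace_dtype(tf.float32) == mace_pb2.DT_FLOAT
assert tf_dtype_2_mace_dtype(tf.qint8) == mace_pb2.DT_INT8  # quantized -> plain int8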
import sys
import operator
from mace.proto import mace_pb2


class MemoryOptimizer(object):
  def __init__(self, net_def):
    self.net_def = net_def
    self.idle_mem = set()
    self.op_mem = {}  # op_name->mem_id
    self.mem_block = {}  # mem_id->[x, y]
    self.total_mem_count = 0
    self.ref_counter = {}

    consumers = {}
    for op in net_def.op:
      if self.is_buffer_image_op(op):
        continue
      for ipt in op.input:
        if ipt not in consumers:
          consumers[ipt] = []
        consumers[ipt].append(op)
    # only ref op's output tensor
    for op in net_def.op:
      if self.is_buffer_image_op(op):
        continue
      tensor_name = self._op_to_tensor(op)
      if tensor_name in consumers:
        self.ref_counter[tensor_name] = len(consumers[tensor_name])
      else:
        self.ref_counter[tensor_name] = 0

  def _op_to_tensor(self, op):
    return op.name + ':0'

  def is_buffer_image_op(self, op):
    return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'

  def optimize(self):
    for op in self.net_def.op:
      if self.is_buffer_image_op(op):
        continue
      if len(self.idle_mem) == 0:
        # allocate new mem
        mem_id = self.total_mem_count
        self.total_mem_count += 1
      else:
        # reuse mem
        mem_id = self.idle_mem.pop()
      op.mem_id = mem_id
      self.op_mem[self._op_to_tensor(op)] = mem_id
      if mem_id not in self.mem_block:
        self.mem_block[mem_id] = [0, 0]
      mem_size = self.mem_block[mem_id]
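      # The image extents below assume NHWC output dims: y spans N * H rows
      # and x spans W * ceil(C / 4) pixels, packing 4 channels per RGBA pixel.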
      mem_size[1] = max(mem_size[1],
                        op.output_shape[0].dims[0] * op.output_shape[0].dims[1])
      mem_size[0] = max(mem_size[0],
                        op.output_shape[0].dims[2] *
                        (op.output_shape[0].dims[3] + 3) / 4)

      # de-ref input tensor mem
      for ipt in op.input:
        if ipt in self.ref_counter:
          self.ref_counter[ipt] -= 1
          if self.ref_counter[ipt] == 0:
            self.idle_mem.add(self.op_mem[ipt])
          elif self.ref_counter[ipt] < 0:
            raise Exception('ref count is less than 0')

    arena = self.net_def.mem_arena
    for mem in self.mem_block:
      block = arena.mem_block.add()
      block.mem_id = mem
      block.x = self.mem_block[mem][0]
      block.y = self.mem_block[mem][1]

    print('total op: %d' % len(self.net_def.op))
    origin_mem_size = 0
    optimized_mem_size = 0
    for op in self.net_def.op:
      if self.is_buffer_image_op(op):
        continue
      origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
    for mem in self.mem_block:
      optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
    print('origin mem: %d, optimized mem: %d' %
          (origin_mem_size, optimized_mem_size))


if __name__ == '__main__':
  model_file = sys.argv[1]
  opt_model_file = sys.argv[2]
  with open(model_file, "rb") as f:
    net_def = mace_pb2.NetDef()
    net_def.ParseFromString(f.read())
    optimizer = MemoryOptimizer(net_def)
    optimizer.optimize()
  with open(opt_model_file, "wb") as f:
    f.write(net_def.SerializeToString())
  with open(opt_model_file + '_txt', "wb") as f:
    net_def.ClearField('tensors')
    f.write(str(net_def))
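To make the reuse policy concrete, a hedged usage sketch on a three-op chain (importing MemoryOptimizer directly is an assumption; the BUILD file above only declares memory_optimizer.py as a py_binary):

from mace.proto import mace_pb2
from memory_optimizer import MemoryOptimizer

net_def = mace_pb2.NetDef()
prev_output = 'input:0'
for name in ['conv1', 'conv2', 'conv3']:
  op = net_def.op.add()
  op.name = name
  op.type = 'Conv2D'
  op.input.append(prev_output)
  op.output_shape.add().dims.extend([1, 32, 32, 8])  # NHWC
  prev_output = name + ':0'

MemoryOptimizer(net_def).optimize()
# conv2 cannot overwrite conv1's still-live output, so it gets a second
# block; by conv3, conv1's block has been released and is recycled.
print([op.mem_id for op in net_def.op])  # [0, 1, 0]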
from mace.proto import mace_pb2
import tensorflow as tf
import numpy as np
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
# TODO: support NCHW format; only NHWC is supported now.
padding_mode = {
@@ -110,6 +111,19 @@ def add_output_transform(name, net_def):
   epsilon_arg.name = 'buffer_type'
   epsilon_arg.i = buffer_type_map['IN_OUT']

+
+def convert_op_outputs(mace_op_def, tf_op):
+  mace_op_def.output.extend([output.name for output in tf_op.outputs])
+  mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
+                                  for output in tf_op.outputs])
+  output_shapes = []
+  for output in tf_op.outputs:
+    output_shape = mace_pb2.OutputShape()
+    output_shape.dims.extend(output.shape.as_list())
+    output_shapes.append(output_shape)
+  mace_op_def.output_shape.extend(output_shapes)
+
+
 def convert_ops(unresolved_ops, dt, net_def, device):
   ops_count = len(unresolved_ops)
   resolved_count = 1
@@ -171,13 +185,7 @@ def convert_ops(unresolved_ops, dt, net_def, device):
       final_op = relu_op
       resolved_count = 4

-    op_def.output.extend([output.name for output in final_op.outputs])
-    output_shapes = []
-    for output in final_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, final_op)
   elif first_op.type == 'FusedBatchNorm':
     op_def.name = first_op.name
@@ -225,26 +233,15 @@ def convert_ops(unresolved_ops, dt, net_def, device):
     op_def.name = first_op.name[:-4]  # remove /add
     op_def.type = 'BatchNorm'
     op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
-    op_def.output.extend([output.name for output in add_1_op.outputs])
-    output_shapes = []
-    for output in add_1_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, add_1_op)
     resolved_count = 7
   elif first_op.type == 'Relu6':
     op_def.name = first_op.name
     op_def.type = 'Relu'
     op_def.input.extend([input.name for input in first_op.inputs])
-    op_def.output.extend([output.name for output in first_op.outputs])
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
     max_limit_arg = op_def.arg.add()
     max_limit_arg.name = 'max_limit'
     max_limit_arg.f = 6
@@ -252,13 +249,8 @@ def convert_ops(unresolved_ops, dt, net_def, device):
     op_def.name = first_op.name
     op_def.type = 'Pooling'
     op_def.input.extend([input.name for input in first_op.inputs])
-    op_def.output.extend([output.name for output in first_op.outputs])
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
     pooling_type_arg = op_def.arg.add()
     pooling_type_arg.name = 'pooling_type'
     pooling_type_arg.i = pooling_type_mode[first_op.type]
@@ -278,55 +270,31 @@ def convert_ops(unresolved_ops, dt, net_def, device):
     op_def.name = first_op.name
     op_def.type = "AddN"
     op_def.input.extend([input.name for input in first_op.inputs])
-    op_def.output.extend([output.name for output in first_op.outputs])
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
   elif first_op.type == 'ConcatV2':
     op_def.name = first_op.name
     op_def.type = "Concat"
     op_def.input.extend([first_op.inputs[i].name for i in xrange(2)])
-    op_def.output.extend([output.name for output in first_op.outputs])
     axis_arg = op_def.arg.add()
     axis_arg.name = 'axis'
     axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32)
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
   elif first_op.type == 'ResizeBilinear':
     op_def.name = first_op.name
     op_def.type = "ResizeBilinear"
    op_def.input.extend([first_op.inputs[0].name])
-    op_def.output.extend([output.name for output in first_op.outputs])
     size_arg = op_def.arg.add()
     size_arg.name = 'size'
     size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat)
     size_arg = op_def.arg.add()
     size_arg.name = 'align_corners'
     size_arg.i = first_op.get_attr('align_corners')
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
   elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']:
     op_def.name = first_op.name
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
-    op_def.output.extend([output.name for output in first_op.outputs])
-    output_shapes = []
-    for output in first_op.outputs:
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(output.shape.as_list())
-      output_shapes.append(output_shape)
-    op_def.output_shape.extend(output_shapes)
+    convert_op_outputs(op_def, first_op)
   else:
     raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
     pass
......
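For reference, a sketch of the three fields the new convert_op_outputs helper fills for a single op, reproduced inline against a toy graph (hypothetical standalone usage; in the converter the helper is called with the real op_def):

import tensorflow as tf
from mace.proto import mace_pb2
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype

with tf.Graph().as_default():
  relu = tf.nn.relu(tf.placeholder(tf.float32, [1, 512, 512, 3]), name='relu').op

  op_def = mace_pb2.OperatorDef()
  op_def.output.extend([out.name for out in relu.outputs])  # ['relu:0']
  op_def.output_type.extend(
      [tf_dtype_2_mace_dtype(out.dtype) for out in relu.outputs])  # [DT_FLOAT]
  for out in relu.outputs:
    op_def.output_shape.add().dims.extend(out.shape.as_list())  # [1, 512, 512, 3]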
@@ -3,6 +3,7 @@ import tensorflow as tf
 from operator import mul
 from dsp_ops import DspOps
 from mace.python.tools import graph_util
+from mace.python.tools.convert_util import tf_dtype_2_mace_dtype

 # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
 # --runtime dsp --input_node input_node --output_node output_node
@@ -65,6 +66,18 @@ def add_shape_const_node(net_def, op, values, name):
   tensor.dims.extend(values)
   return tensor.name

+
+def convert_op_outputs(mace_op_def, tf_op):
+  mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
+                                  for output in tf_op.outputs])
+  output_shapes = []
+  for output in tf_op.outputs:
+    output_shape = mace_pb2.OutputShape()
+    output_shape.dims.extend(output.shape.as_list())
+    output_shapes.append(output_shape)
+  mace_op_def.output_shape.extend(output_shapes)
+
+
 def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
   first_op = unresolved_ops[0]
   print('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
@@ -120,6 +133,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
     op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
     op_def.input.extend([min_tensor.name, max_tensor.name])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs])
+    convert_op_outputs(op_def, quantize_op)
   elif has_padding_and_strides(first_op):
     op_def.padding = padding_mode[first_op.get_attr('padding')]
     op_def.input.extend([t.name for t in first_op.inputs])
@@ -131,13 +145,15 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
     strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides')
     op_def.input.extend([strides_tensor])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+    convert_op_outputs(op_def, first_op)
   elif is_node_flatten_reshape(first_op):
     op_def.type = 'Flatten'
     op_def.input.extend([t.name for t in first_op.inputs])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+    convert_op_outputs(op_def, first_op)
   elif dsp_ops.has_op(first_op.type):
     op_def.input.extend([t.name for t in first_op.inputs])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+    convert_op_outputs(op_def, first_op)
   else:
     raise Exception('Unsupported op: ', first_op)
@@ -311,6 +327,10 @@ def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
       new_input_op.padding = input_op.padding
       new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0] / 4, 4, 4])
       new_ops.append(new_input_op)
+      new_input_op.output_shape.extend([input_op.output_shape[0],
+                                        minf_op.output_shape[0],
+                                        maxf_op.output_shape[0]])
+      new_input_op.output_type.extend([input_op.output_type[0], mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT])
       for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
         new_follow_op = mace_pb2.OperatorDef()
         new_follow_op.CopyFrom(follow_op)
#!/bin/bash
# Must run at root dir of mace project.
set +x
Usage() {
echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
}
@@ -13,6 +13,7 @@ fi
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_MODEL_NAME='mace_model.pb'
+MACE_OPT_MODEL_NAME='mace_opt_model.pb'
 INPUT_FILE_NAME='model_input'
 OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
@@ -26,14 +27,17 @@ python tools/validate.py --generate_data true --random_seed 1 \
          --input_shape=512,512,3

 # Step 2: convert tf model to mace model
-echo "Step 2: convert tf model to mace model"
+echo "Step 2: convert tf model to mace model and optimize memory"
 bazel build //mace/python/tools:tf_converter
 bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
                                          --output=${MODEL_DIR}/${MACE_MODEL_NAME} \
                                          --input_node=input \
                                          --output_node=GCN/br_result_2/fcn_br \
-                                         --data_type=DT_HALF\
+                                         --data_type=DT_HALF \
                                          --runtime=gpu
+bazel build mace/python/tools:memory_optimizer
+bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
+                                             ${MODEL_DIR}/${MACE_OPT_MODEL_NAME}

 # Step 3: Run model on the phone
@@ -46,7 +50,7 @@ bazel build -c opt --strip always mace/examples:mace_run \
 adb shell "mkdir -p ${PHONE_DATA_DIR}"
 adb shell "mkdir -p ${KERNEL_DIR}"
 adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
-adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
+adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
@@ -56,13 +60,14 @@ adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
            MACE_KERNEL_PATH=$KERNEL_DIR \
            OMP_NUM_THREADS=$num_threads \
            ${PHONE_DATA_DIR}/mace_run \
-             --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
+             --model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
              --input=mace_input_node \
              --output=mace_output_node \
              --input_shape=1,512,512,3 \
              --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
              --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
-             --device=OPENCL
+             --device=OPENCL \
+             --round=1
# Step 4: pull the mace run result.
echo "Step 4: pull the mace run result."
......