diff --git a/mace/dsp/hexagon_control_wrapper.cc b/mace/dsp/hexagon_control_wrapper.cc
index 08ad17b52eab45b905079ce8dd7f647617d33d6a..3f25a5d78d208d3d10abbd18ae6234ae2033d2ed 100644
--- a/mace/dsp/hexagon_control_wrapper.cc
+++ b/mace/dsp/hexagon_control_wrapper.cc
@@ -228,4 +228,116 @@ void HexagonControlWrapper::ResetPerfInfo() {
   hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
 }
 
+// Runs a single-input, single-output graph through hexagon_nn_execute.
+// Returns true when the call reports 0.
+bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                         Tensor *output_tensor) {
+  LOG(INFO) << "Execute graph: " << nn_id_;
+  // single input and single output
+  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
+  output_tensor->SetDtype(output_data_types_[0]);
+  output_tensor->Resize(output_shapes_[0]);
+  vector output_shape(4);
+  uint32_t output_bytes;
+  int res = hexagon_nn_execute(nn_id_,
+                               input_tensor.shape()[0],
+                               input_tensor.shape()[1],
+                               input_tensor.shape()[2],
+                               input_tensor.shape()[3],
+                               reinterpret_cast(
+                                   input_tensor.raw_data()),
+                               input_tensor.raw_size(),
+                               &output_shape[0],
+                               &output_shape[1],
+                               &output_shape[2],
+                               &output_shape[3],
+                               reinterpret_cast(
+                                   output_tensor->raw_mutable_data()),
+                               output_tensor->raw_size(),
+                               &output_bytes);
+
+  MACE_ASSERT(output_shape == output_shapes_[0],
+              "wrong output shape inferred");
+  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
+              "wrong output bytes inferred.");
+  return res == 0;
+}
+
+// Runs the graph through hexagon_nn_execute_new, passing one
+// hexagon_nn_tensordef per input and output tensor. Returns true when the
+// call reports 0.
+bool HexagonControlWrapper::ExecuteGraphNew(const vector &input_tensors,
+                                            vector *output_tensors) {
+  LOG(INFO) << "Execute graph new: " << nn_id_;
+  int num_inputs = input_tensors.size();
+  int num_outputs = output_tensors->size();
+  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
+  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
+
+  // Vectors own the descriptor arrays, so they are freed on every exit path
+  // without manual delete[].
+  vector<hexagon_nn_tensordef> inputs(num_inputs);
+  vector<hexagon_nn_tensordef> outputs(num_outputs);
+
+  for (int i = 0; i < num_inputs; ++i) {
+    vector input_shape = input_tensors[i].shape();
+    inputs[i].batches = input_shape[0];
+    inputs[i].height = input_shape[1];
+    inputs[i].width = input_shape[2];
+    inputs[i].depth = input_shape[3];
+    inputs[i].data = const_cast(
+        reinterpret_cast(input_tensors[i].raw_data()));
+    inputs[i].dataLen = input_tensors[i].raw_size();
+    inputs[i].data_valid_len = input_tensors[i].raw_size();
+    inputs[i].unused = 0;
+  }
+
+  for (int i = 0; i < num_outputs; ++i) {
+    (*output_tensors)[i].SetDtype(output_data_types_[i]);
+    (*output_tensors)[i].Resize(output_shapes_[i]);
+    outputs[i].data = reinterpret_cast(
+        (*output_tensors)[i].raw_mutable_data());
+    outputs[i].dataLen = (*output_tensors)[i].raw_size();
+  }
+
+  int res = hexagon_nn_execute_new(nn_id_, inputs.data(), num_inputs,
+                                   outputs.data(), num_outputs);
+
+  for (int i = 0; i < num_outputs; ++i) {
+    vector output_shape {outputs[i].batches, outputs[i].height,
+                         outputs[i].width, outputs[i].depth};
+    MACE_ASSERT(output_shape == output_shapes_[i],
+                "wrong output shape inferred");
+    MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
+                "wrong output bytes inferred.");
+  }
+
+  return res == 0;
+}
+
+bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
+                                                    Tensor *output_tensor) {
+  vector input_tensors(3);
+  vector output_tensors(3);
+  input_tensors[0].SetDtype(DT_UINT8);
+  output_tensors[0].SetDtype(DT_UINT8);
+  input_tensors[0].ResizeLike(input_tensor);
+  input_tensors[1].Resize({1, 1, 1, 1});
+  float *min_in_data = input_tensors[1].mutable_data();
+  input_tensors[2].Resize({1, 1, 1, 1});
+  float *max_in_data = input_tensors[2].mutable_data();
+  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
+  if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
+    return false;
+  }
+
+  output_tensor->ResizeLike(output_tensors[0]);
+
+  const float *min_out_data = output_tensors[1].data();
+  const float *max_out_data = 
output_tensors[2].data(); + quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor); + return true; +} + } // namespace mace \ No newline at end of file diff --git a/mace/dsp/hexagon_control_wrapper.h b/mace/dsp/hexagon_control_wrapper.h index 3fe1b3a7053ecca97f4391ab0c88dee93b53de07..fa9f47b1bc7fbbb08e58b70ab15f5cb8e884f847 100644 --- a/mace/dsp/hexagon_control_wrapper.h +++ b/mace/dsp/hexagon_control_wrapper.h @@ -7,6 +7,7 @@ #include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon_nn_ops.h" +#include "mace/dsp/util/quantize.h" #include "mace/core/common.h" #include "mace/core/tensor.h" #include "mace/proto/mace.pb.h" @@ -23,83 +24,10 @@ class HexagonControlWrapper { bool Finalize(); bool SetupGraph(const NetDef& net_def); bool SetupGraph(const std::string &model_file); - bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { - LOG(INFO) << "Execute graph: " << nn_id_; - // single input and single output - MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num"); - MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num"); - output_tensor->SetDtype(output_data_types_[0]); - output_tensor->Resize(output_shapes_[0]); - vector output_shape(4); - uint32_t output_bytes; - int res = hexagon_nn_execute(nn_id_, - input_tensor.shape()[0], - input_tensor.shape()[1], - input_tensor.shape()[2], - input_tensor.shape()[3], - reinterpret_cast( - input_tensor.raw_data()), - input_tensor.raw_size(), - &output_shape[0], - &output_shape[1], - &output_shape[2], - &output_shape[3], - reinterpret_cast( - output_tensor->raw_mutable_data()), - output_tensor->raw_size(), - &output_bytes); - - MACE_ASSERT(output_shape == output_shapes_[0], - "wrong output shape inferred"); - MACE_ASSERT(output_bytes == output_tensor->raw_size(), - "wrong output bytes inferred."); - return res == 0; - }; - - bool ExecuteGraphNew(const Tensor *input_tensors, int num_inputs, - Tensor *output_tensors, int num_outputs) { - LOG(INFO) << "Execute graph new: " 
<< nn_id_; - MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num"); - MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num"); - - hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs]; - hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs]; - - for (int i = 0; i < num_inputs; ++i) { - vector input_shape = input_tensors[i].shape(); - inputs[i].batches = input_shape[0]; - inputs[i].height = input_shape[1]; - inputs[i].width = input_shape[2]; - inputs[i].depth = input_shape[3]; - inputs[i].data = const_cast( - reinterpret_cast(input_tensors[i].raw_data())); - inputs[i].dataLen = input_tensors[i].raw_size(); - inputs[i].data_valid_len = input_tensors[i].raw_size(); - inputs[i].unused = 0; - } - - for (int i = 0; i < num_outputs; ++i) { - output_tensors[i].SetDtype(output_data_types_[i]); - output_tensors[i].Resize(output_shapes_[i]); - vector output_shape = output_tensors[0].shape(); - outputs[i].batches = output_shape[0]; - outputs[i].height = output_shape[1]; - outputs[i].width = output_shape[2]; - outputs[i].depth = output_shape[3]; - outputs[i].data = reinterpret_cast( - output_tensors[i].raw_mutable_data()); - outputs[i].dataLen = output_tensors[i].raw_size(); - outputs[i].data_valid_len = output_tensors[i].raw_size(); - outputs[i].unused = 0; - } - - int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs, - outputs, num_outputs); - - delete [] inputs; - delete [] outputs; - return res == 0; - }; + bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); + bool ExecuteGraphNew(const vector& input_tensors, + vector *output_tensors); + bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor); bool TeardownGraph(); void PrintLog(); @@ -118,6 +46,7 @@ class HexagonControlWrapper { int nn_id_; Serializer serializer_; + Quantizer quantizer_; vector> input_shapes_; vector> output_shapes_; diff --git a/mace/dsp/hexagon_control_wrapper_test.cc b/mace/dsp/hexagon_control_wrapper_test.cc 
index 48a743c69ecdb09bb09ca95412fe8852a86a55eb..b34e028c16b80fdfe9c280a3edf353fa9e040ec6 100644 --- a/mace/dsp/hexagon_control_wrapper_test.cc +++ b/mace/dsp/hexagon_control_wrapper_test.cc @@ -8,7 +8,7 @@ using namespace mace; -TEST(HexagonControlerWrapper, GetVersion) { +TEST(HexagonControlerWrapper, InputFloat) { testing::internal::LogToStderr(); HexagonControlWrapper wrapper; VLOG(0) << "version: " << wrapper.GetVersion(); @@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { wrapper.ResetPerfInfo(); timeval tv1, tv2; gettimeofday(&tv1, NULL); - int round = 2; + int round = 10; for (int i = 0; i < round; ++i) { VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); } @@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { } std::cout << std::endl; + VLOG(0) << wrapper.TeardownGraph(); + wrapper.Finalize(); +} + +TEST(HexagonControlerWrapper, PreQuantize) { + testing::internal::LogToStderr(); + HexagonControlWrapper wrapper; + VLOG(0) << "version: " << wrapper.GetVersion(); + wrapper.Init(); + wrapper.SetDebugLevel(0); + wrapper.Config(); + VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb"); + wrapper.PrintGraph(); + + Tensor input_tensor; + Tensor output_tensor; + input_tensor.Resize({1, 480, 480, 3}); + float *input_data = input_tensor.mutable_data(); + for (int i = 0; i < input_tensor.size(); ++i) { + input_data[i] = i % 256; + } + + wrapper.ResetPerfInfo(); + timeval tv1, tv2; + gettimeofday(&tv1, NULL); + int round = 10; + for (int i = 0; i < round; ++i) { + VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor); + } + gettimeofday(&tv2, NULL); + VLOG(0) << "avg duration: " + << ((tv2.tv_sec - tv1.tv_sec) * 1000 + + (tv2.tv_usec - tv1.tv_usec) / 1000) / + round; + + wrapper.GetPerfInfo(); + wrapper.PrintLog(); + + const float *output_data = output_tensor.data(); + for (int i = 0; i < output_tensor.size(); ++i) { + std::cout << output_data[i] << " "; + } + std::cout << std::endl; + VLOG(0) << 
wrapper.TeardownGraph();
   wrapper.Finalize();
 }
\ No newline at end of file
diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py
index fbf19f5b7cf8d705683a959bece07067ac43a5f9..d30a463ca2bce938d716e799f82049308e044586 100644
--- a/mace/python/tools/tf_converter.py
+++ b/mace/python/tools/tf_converter.py
@@ -21,7 +21,7 @@ def main(unused_args):
 
   if FLAGS.runtime == 'dsp':
     output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
-      input_graph_def, FLAGS.input_node, FLAGS.output_node)
+      input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
   else:
     output_graph_def = tf_converter_lib.convert_to_mace_pb(
       input_graph_def)
@@ -62,6 +62,13 @@ def parse_args():
     type=str,
     default="softmax",
     help="e.g., softmax")
+  parser.add_argument(
+    "--prequantize",
+    # bool("False") is True, so type=bool would turn any non-empty value
+    # (including "False") into True; parse the text explicitly instead.
+    type=lambda value: value.lower() in ("true", "t", "yes", "y", "1"),
+    default=False,
+    help="e.g., False")
   return parser.parse_known_args()
 
 
diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py
index 8f925059279d2b50b13fc28aaf1aca975ec67bc7..ced16ce853e8f49b9c968e09ed257a8e3bf815b5 100644
--- a/mace/python/tools/tf_dsp_converter_lib.py
+++ b/mace/python/tools/tf_dsp_converter_lib.py
@@ -5,7 +5,7 @@ from dsp_ops import DspOps
 from mace.python.tools import graph_util
 
 # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
-# --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node
+# --runtime dsp --input_node input_node --output_node output_node
 
 padding_mode = {
   'NA': 0,
@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
           for follow_op in follow_ops:
             new_follow_op = mace_pb2.OperatorDef()
             new_follow_op.CopyFrom(follow_op)
-            for i in range(len(follow_op.input)):
-              for k in range(3):
+            for i in xrange(len(follow_op.input)):
+              for k in xrange(3):
                 if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
                   new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
             new_ops.append(new_follow_op)
@@ 
-220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def):
 
   new_net_def = mace_pb2.NetDef()
   new_net_def.tensors.extend(tensor_map.values())
-  for op in net_def.op:
-    if op.name not in skip_ops:
-      new_net_def.op.extend([op])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
   new_net_def.op.extend(new_ops)
 
   return new_net_def
@@ -249,29 +247,106 @@ def add_node_id(net_def):
   return net_def
 
 
-def add_input_output_info(net_def, input_node, output_node, graph):
+def add_input_output_info(net_def, input_node, output_node, graph, dtype):
   input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
   output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
 
-  for op in net_def.op:
-    if op.name == input_node:
+  input_info = net_def.input_info.add()
+  input_info.dims.extend(input_tensor.shape.as_list())
+  input_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       input_info = net_def.input_info.add()
-      input_info.name = op.name
-      input_info.node_id = op.node_id
-      input_info.dims.extend(input_tensor.shape.as_list())
-      input_info.max_byte_size = max_elem_size(input_tensor)
-      input_info.data_type = find_dtype(input_tensor.dtype)
-    elif op.name == output_node:
+      input_info.dims.extend([1,1,1,1])
+      input_info.data_type = mace_pb2.DT_FLOAT
+
+  output_info = net_def.output_info.add()
+  output_info.dims.extend(output_tensor.shape.as_list())
+  output_info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    for i in xrange(2):
       output_info = net_def.output_info.add()
-      output_info.name = op.name
-      output_info.node_id = op.node_id
-      output_info.dims.extend(output_tensor.shape.as_list())
-      output_info.max_byte_size = max_elem_size(output_tensor)
-      output_info.data_type = find_dtype(output_tensor.dtype)
+      output_info.dims.extend([1,1,1,1])
+      output_info.data_type = mace_pb2.DT_FLOAT
 
   return net_def
 
-def convert_to_mace_pb(input_graph_def, input_node, output_node):
+def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
+  """Drops the Quantize prologue after INPUT and the Dequantize before OUTPUT.
+
+  Consumers of the Quantize op are re-pointed at the INPUT op's outputs, and
+  the OUTPUT op is re-pointed at the Dequantize op's inputs. Returns a new
+  NetDef; input_node and output_node are not read here.
+  """
+  op_map = {}
+  for op in net_def.op:
+    op_map[op.name] = op
+  consumers = {}
+  for op in net_def.op:
+    for ipt in op.input:
+      if ipt not in consumers:
+        consumers[ipt] = []
+      consumers[ipt].append(op)
+
+  skip_ops = set()
+  new_ops = []
+  skip_tensors = set()
+
+  # INPUT->Flatten->Minf, Maxf->Quantize
+  for op in net_def.op:
+    if op.type == 'INPUT':
+      input_op = op
+      flatten_op = None
+      quantize_op = None
+      for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
+        if o.type == 'Flatten':
+          flatten_op = o
+        elif o.type == 'Quantize':
+          quantize_op = o
+      if quantize_op is not None:
+        minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
+        skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
+        skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
+
+        new_input_op = mace_pb2.OperatorDef()
+        new_input_op.name = input_op.name
+        new_input_op.type = input_op.type
+        new_input_op.padding = input_op.padding
+        # '//' keeps the byte size an int on Python 3 as well; the two extra
+        # 4-byte outputs presumably carry min/max -- TODO confirm.
+        new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0] // 4, 4, 4])
+        new_ops.append(new_input_op)
+        for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
+          new_follow_op = mace_pb2.OperatorDef()
+          new_follow_op.CopyFrom(follow_op)
+          for i in xrange(len(follow_op.input)):
+            for k in xrange(3):
+              if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
+                new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
+          new_ops.append(new_follow_op)
+          skip_ops.add(follow_op.name)
+
+    elif op.type == 'OUTPUT':
+      output_op = op
+      dequantize_op = get_node_from_map(op_map, output_op.input[0])
+      if dequantize_op.type == 'Dequantize':
+        skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
+
+        new_output_op = mace_pb2.OperatorDef()
+        new_output_op.name = output_op.name
+        new_output_op.type = output_op.type
+        new_output_op.input.extend(dequantize_op.input)
+        new_ops.append(new_output_op)
+
+
+
+  new_net_def = mace_pb2.NetDef()
+  new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
+  new_net_def.op.extend(new_ops)
+  return new_net_def
+
+def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
   """
     nnlib does not have batch norm, so use tensorflow optimizer to fold
      batch norm with convolution. The fold optimization reorders ops, so
@@ -298,10 +373,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
       add_output_node(net_def, output_node)
 
       # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
+
+      if prequantize:
+        net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
+
       sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
       net_def_with_node_id = add_node_id(sorted_net_def)
 
-      final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph)
+      if prequantize:
+        dtype = mace_pb2.DT_UINT8
+      else:
+        dtype = mace_pb2.DT_FLOAT
+      final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
 
       return final_net_def
 