提交 d2e4da63 编写于 作者: Y Yin Li

Pre-quantize data on CPU

上级 902f320c
......@@ -228,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
}
// Runs the graph identified by nn_id_ on a single input tensor and writes the
// result into output_tensor.
//
// Only graphs with exactly one input and one output are accepted (asserted).
// output_tensor is retyped to output_data_types_[0] and resized to
// output_shapes_[0] before the call, so hexagon_nn_execute writes directly
// into its buffer.
//
// Returns true when hexagon_nn_execute returns 0, false otherwise.
bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
                                         Tensor *output_tensor) {
  LOG(INFO) << "Execute graph: " << nn_id_;
  // single input and single output
  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");

  // The call below indexes dimensions 0..3, so anything but rank 4 would read
  // past the end of the shape vector.
  const auto &input_shape = input_tensor.shape();
  MACE_ASSERT(input_shape.size() == 4, "Wrong input rank");

  output_tensor->SetDtype(output_data_types_[0]);
  output_tensor->Resize(output_shapes_[0]);

  vector<uint32_t> output_shape(4);
  // Initialised so the byte-count check never reads an indeterminate value.
  uint32_t output_bytes = 0;
  const int res = hexagon_nn_execute(
      nn_id_,
      input_shape[0],
      input_shape[1],
      input_shape[2],
      input_shape[3],
      reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
      input_tensor.raw_size(),
      &output_shape[0],
      &output_shape[1],
      &output_shape[2],
      &output_shape[3],
      reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
      output_tensor->raw_size(),
      &output_bytes);
  if (res != 0) {
    // The reported shape/byte count is only meaningful after a successful
    // run; report the failure instead of tripping an unrelated assertion.
    return false;
  }

  // output_shape holds uint32_t while output_shapes_ holds index_t, so the
  // shapes are compared element by element through a common integer type
  // rather than with vector operator==.
  MACE_ASSERT(output_shape.size() == output_shapes_[0].size(),
              "wrong output shape inferred");
  for (size_t d = 0;
       d < output_shape.size() && d < output_shapes_[0].size(); ++d) {
    MACE_ASSERT(static_cast<int64_t>(output_shape[d]) ==
                    static_cast<int64_t>(output_shapes_[0][d]),
                "wrong output shape inferred");
  }
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return true;
}
bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
vector<Tensor> *output_tensors) {
LOG(INFO) << "Execute graph new: " << nn_id_;
int num_inputs = input_tensors.size();
int num_outputs = output_tensors->size();
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
for (int i = 0; i < num_inputs; ++i) {
vector<index_t> input_shape = input_tensors[i].shape();
inputs[i].batches = input_shape[0];
inputs[i].height = input_shape[1];
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
}
for (int i = 0; i < num_outputs; ++i) {
(*output_tensors)[i].SetDtype(output_data_types_[i]);
(*output_tensors)[i].Resize(output_shapes_[i]);
outputs[i].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i].raw_mutable_data());
outputs[i].dataLen = (*output_tensors)[i].raw_size();
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
for (int i = 0; i < num_outputs; ++i) {
vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
outputs[i].width, outputs[i].depth};
MACE_ASSERT(output_shape == output_shapes_[i],
"wrong output shape inferred");
MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
"wrong output bytes inferred.");
}
delete [] inputs;
delete [] outputs;
return res == 0;
};
bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor) {
vector<Tensor> input_tensors(3);
vector<Tensor> output_tensors(3);
input_tensors[0].SetDtype(DT_UINT8);
output_tensors[0].SetDtype(DT_UINT8);
input_tensors[0].ResizeLike(input_tensor);
input_tensors[1].Resize({1, 1, 1, 1});
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
}
output_tensor->ResizeLike(output_tensors[0]);
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
return true;
}
} // namespace mace
\ No newline at end of file
......@@ -7,6 +7,7 @@
#include "mace/dsp/hexagon/hexagon_controller.h"
#include "mace/dsp/hexagon_nn_ops.h"
#include "mace/dsp/util/quantize.h"
#include "mace/core/common.h"
#include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h"
......@@ -23,83 +24,10 @@ class HexagonControlWrapper {
bool Finalize();
bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const std::string &model_file);
// Runs the graph identified by nn_id_ on a single input tensor and writes the
// result into output_tensor.
//
// Only graphs with exactly one input and one output are accepted (asserted).
// output_tensor is retyped to output_data_types_[0] and resized to
// output_shapes_[0] before the call.
//
// Returns true when hexagon_nn_execute returns 0, false otherwise.
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) {
  LOG(INFO) << "Execute graph: " << nn_id_;
  // single input and single output
  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");

  // Dimensions 0..3 are read below, so the input has to be rank 4.
  const auto &input_shape = input_tensor.shape();
  MACE_ASSERT(input_shape.size() == 4, "Wrong input rank");

  output_tensor->SetDtype(output_data_types_[0]);
  output_tensor->Resize(output_shapes_[0]);

  vector<uint32_t> output_shape(4);
  // Initialised so the byte-count check never reads an indeterminate value.
  uint32_t output_bytes = 0;
  const int res = hexagon_nn_execute(
      nn_id_,
      input_shape[0],
      input_shape[1],
      input_shape[2],
      input_shape[3],
      reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
      input_tensor.raw_size(),
      &output_shape[0],
      &output_shape[1],
      &output_shape[2],
      &output_shape[3],
      reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
      output_tensor->raw_size(),
      &output_bytes);
  if (res != 0) {
    // Shape and byte count are only meaningful after a successful run.
    return false;
  }

  // output_shape holds uint32_t while output_shapes_ holds index_t; compare
  // element by element through a common integer type.
  MACE_ASSERT(output_shape.size() == output_shapes_[0].size(),
              "wrong output shape inferred");
  for (size_t d = 0;
       d < output_shape.size() && d < output_shapes_[0].size(); ++d) {
    MACE_ASSERT(static_cast<int64_t>(output_shape[d]) ==
                    static_cast<int64_t>(output_shapes_[0][d]),
                "wrong output shape inferred");
  }
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return true;
}
bool ExecuteGraphNew(const Tensor *input_tensors, int num_inputs,
Tensor *output_tensors, int num_outputs) {
LOG(INFO) << "Execute graph new: " << nn_id_;
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
for (int i = 0; i < num_inputs; ++i) {
vector<index_t> input_shape = input_tensors[i].shape();
inputs[i].batches = input_shape[0];
inputs[i].height = input_shape[1];
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
}
for (int i = 0; i < num_outputs; ++i) {
output_tensors[i].SetDtype(output_data_types_[i]);
output_tensors[i].Resize(output_shapes_[i]);
vector<index_t> output_shape = output_tensors[0].shape();
outputs[i].batches = output_shape[0];
outputs[i].height = output_shape[1];
outputs[i].width = output_shape[2];
outputs[i].depth = output_shape[3];
outputs[i].data = reinterpret_cast<unsigned char *>(
output_tensors[i].raw_mutable_data());
outputs[i].dataLen = output_tensors[i].raw_size();
outputs[i].data_valid_len = output_tensors[i].raw_size();
outputs[i].unused = 0;
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
delete [] inputs;
delete [] outputs;
return res == 0;
};
// Runs a graph that has exactly one input and one output. output_tensor is
// retyped and resized by the call. Returns true when execution succeeds.
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
// Runs a graph with one tensor per graph input and per graph output.
// output_tensors must already contain one tensor per graph output; each is
// retyped and resized by the call. Returns true when execution succeeds.
bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
vector<Tensor> *output_tensors);
// Quantizes input_tensor with quantizer_, runs the graph through
// ExecuteGraphNew on the quantized data plus its min/max, and dequantizes the
// first graph output into output_tensor. Returns false if execution fails.
bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
bool TeardownGraph();
void PrintLog();
......@@ -118,6 +46,7 @@ class HexagonControlWrapper {
int nn_id_;
Serializer serializer_;
Quantizer quantizer_;
vector<vector<index_t>> input_shapes_;
vector<vector<index_t>> output_shapes_;
......
......@@ -8,7 +8,7 @@
using namespace mace;
TEST(HexagonControlerWrapper, GetVersion) {
TEST(HexagonControlerWrapper, InputFloat) {
testing::internal::LogToStderr();
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
......@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int round = 2;
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
......@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
}
std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
// End-to-end run of the pre-quantize path: loads quantized_icnet_dsp_u8.pb,
// feeds a 1x480x480x3 float tensor through ExecuteGraphPreQuantize ten times,
// logs the average duration and prints the resulting floats.
//
// The boolean results of setup, execution and teardown are checked with
// EXPECT_TRUE (non-fatal), so a failure is reported by the test while
// teardown and Finalize still run.
TEST(HexagonControlerWrapper, PreQuantize) {
  testing::internal::LogToStderr();
  HexagonControlWrapper wrapper;
  VLOG(0) << "version: " << wrapper.GetVersion();
  wrapper.Init();
  wrapper.SetDebugLevel(0);
  wrapper.Config();

  const bool setup_ok = wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
  VLOG(0) << setup_ok;
  EXPECT_TRUE(setup_ok);
  wrapper.PrintGraph();

  Tensor input_tensor;
  Tensor output_tensor;
  input_tensor.Resize({1, 480, 480, 3});
  float *input_data = input_tensor.mutable_data<float>();
  // Deterministic ramp 0..255 repeated over the whole tensor.
  for (int i = 0; i < input_tensor.size(); ++i) {
    input_data[i] = i % 256;
  }

  wrapper.ResetPerfInfo();
  timeval tv1, tv2;
  gettimeofday(&tv1, nullptr);
  int round = 10;
  for (int i = 0; i < round; ++i) {
    const bool run_ok =
        wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
    VLOG(0) << run_ok;
    EXPECT_TRUE(run_ok);
  }
  gettimeofday(&tv2, nullptr);
  // Elapsed wall time in milliseconds, averaged over the rounds.
  VLOG(0) << "avg duration: "
          << ((tv2.tv_sec - tv1.tv_sec) * 1000 +
              (tv2.tv_usec - tv1.tv_usec) / 1000) /
                 round;

  wrapper.GetPerfInfo();
  wrapper.PrintLog();

  const float *output_data = output_tensor.data<float>();
  for (int i = 0; i < output_tensor.size(); ++i) {
    std::cout << output_data[i] << " ";
  }
  std::cout << std::endl;

  const bool teardown_ok = wrapper.TeardownGraph();
  VLOG(0) << teardown_ok;
  EXPECT_TRUE(teardown_ok);
  wrapper.Finalize();
}
\ No newline at end of file
......@@ -21,7 +21,7 @@ def main(unused_args):
if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node)
input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
else:
output_graph_def = tf_converter_lib.convert_to_mace_pb(
input_graph_def)
......@@ -62,6 +62,11 @@ def parse_args():
type=str,
default="softmax",
help="e.g., softmax")
# argparse applies `type` to the raw command-line string, and bool("False")
# is True, so type=bool would enable the flag for any non-empty value.
# Parse the text explicitly instead; unrecognised values mean False.
parser.add_argument(
    "--prequantize",
    type=lambda text: str(text).strip().lower() in ("true", "t", "yes", "y", "1"),
    default=False,
    help="e.g., False")
return parser.parse_known_args()
......
......@@ -5,7 +5,7 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util
# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
# --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node
# --runtime dsp --input_node input_node --output_node output_node
padding_mode = {
'NA': 0,
......@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in range(len(follow_op.input)):
for k in range(3):
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
new_ops.append(new_follow_op)
......@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def):
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values())
for op in net_def.op:
if op.name not in skip_ops:
new_net_def.op.extend([op])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
......@@ -249,29 +247,101 @@ def add_node_id(net_def):
return net_def
def add_input_output_info(net_def, input_node, output_node, graph):
def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
for op in net_def.op:
if op.name == input_node:
input_info = net_def.input_info.add()
input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add()
input_info.name = op.name
input_info.node_id = op.node_id
input_info.dims.extend(input_tensor.shape.as_list())
input_info.max_byte_size = max_elem_size(input_tensor)
input_info.data_type = find_dtype(input_tensor.dtype)
elif op.name == output_node:
input_info.dims.extend([1,1,1,1])
input_info.data_type = mace_pb2.DT_FLOAT
output_info = net_def.output_info.add()
output_info.dims.extend(output_tensor.shape.as_list())
output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add()
output_info.name = op.name
output_info.node_id = op.node_id
output_info.dims.extend(output_tensor.shape.as_list())
output_info.max_byte_size = max_elem_size(output_tensor)
output_info.data_type = find_dtype(output_tensor.dtype)
output_info.dims.extend([1,1,1,1])
output_info.data_type = mace_pb2.DT_FLOAT
return net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node):
def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
  """Removes the quantize step after the input and the dequantize step before
  the output, returning a new NetDef.

  Input side: for every 'INPUT' op whose first output is consumed by a
  'Quantize' op, the INPUT op, the sibling 'Flatten' op, the two consumers of
  the Flatten output (named minf/maxf here) and the Quantize op are dropped.
  A new INPUT op with the same name, type and padding is added that declares
  three outputs, and every consumer of the Quantize op's first output is
  re-created with its Quantize inputs rewired to the INPUT op's outputs of
  the same index.

  Output side: for every 'OUTPUT' op whose first input comes from a
  'Dequantize' op, both are dropped and a new OUTPUT op is added that takes
  the Dequantize op's inputs directly.

  Args:
    net_def: source NetDef; only read, never modified.
    input_node: unused; kept for interface compatibility.
    output_node: unused; kept for interface compatibility.

  Returns:
    A new NetDef holding the surviving tensors and ops followed by the newly
    created ops.

  Raises:
    ValueError: if an INPUT op feeds a Quantize op but the expected Flatten op
      or its two consumers cannot be found.
  """
  op_map = {}
  consumers = {}
  for op in net_def.op:
    op_map[op.name] = op
    for ipt in op.input:
      consumers.setdefault(ipt, []).append(op)

  skip_ops = set()
  skip_tensors = set()
  new_ops = []

  # INPUT->Flatten->Minf, Maxf->Quantize
  for op in net_def.op:
    if op.type == 'INPUT':
      input_op = op
      flatten_op = None
      quantize_op = None
      # An input nobody consumes simply has no quantize chain to strip.
      for o in consumers.get(get_tensor_name_from_op(input_op.name, 0), []):
        if o.type == 'Flatten':
          flatten_op = o
        elif o.type == 'Quantize':
          quantize_op = o
      if quantize_op is None:
        continue

      if flatten_op is None:
        raise ValueError(
            "INPUT op '%s' feeds a Quantize op but no Flatten op" % input_op.name)
      min_max_ops = consumers.get(get_tensor_name_from_op(flatten_op.name, 0), [])
      if len(min_max_ops) != 2:
        raise ValueError(
            "Flatten op '%s' should have exactly 2 consumers, found %d"
            % (flatten_op.name, len(min_max_ops)))
      minf_op, maxf_op = min_max_ops

      skip_ops.update([input_op.name, flatten_op.name, minf_op.name,
                       maxf_op.name, quantize_op.name])
      # Second inputs of the dropped ops; presumably constant tensors that are
      # no longer referenced -- TODO confirm.
      skip_tensors.update([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])

      new_input_op = mace_pb2.OperatorDef()
      new_input_op.name = input_op.name
      new_input_op.type = input_op.type
      new_input_op.padding = input_op.padding
      # First output is a quarter of the original byte size, followed by two
      # 4-byte outputs (presumably uint8 data plus float min/max -- confirm).
      # '//' keeps this an integer regardless of the division mode in effect.
      new_input_op.out_max_byte_size.extend(
          [input_op.out_max_byte_size[0] // 4, 4, 4])
      new_ops.append(new_input_op)

      # Quantize output k is replaced by INPUT output k.
      renames = dict(
          (get_tensor_name_from_op(quantize_op.name, k),
           get_tensor_name_from_op(input_op.name, k))
          for k in xrange(3))
      rewired = set()
      for follow_op in consumers.get(get_tensor_name_from_op(quantize_op.name, 0), []):
        # An op that lists the same tensor twice appears twice in the
        # consumer list; emit its replacement only once.
        if follow_op.name in rewired:
          continue
        rewired.add(follow_op.name)
        new_follow_op = mace_pb2.OperatorDef()
        new_follow_op.CopyFrom(follow_op)
        for i in xrange(len(follow_op.input)):
          if new_follow_op.input[i] in renames:
            new_follow_op.input[i] = renames[new_follow_op.input[i]]
        new_ops.append(new_follow_op)
        skip_ops.add(follow_op.name)

    elif op.type == 'OUTPUT':
      output_op = op
      dequantize_op = get_node_from_map(op_map, output_op.input[0])
      if dequantize_op.type == 'Dequantize':
        skip_ops.update([dequantize_op.name, output_op.name])

        new_output_op = mace_pb2.OperatorDef()
        new_output_op.name = output_op.name
        new_output_op.type = output_op.type
        new_output_op.input.extend(dequantize_op.input)
        new_ops.append(new_output_op)

  new_net_def = mace_pb2.NetDef()
  new_net_def.tensors.extend(
      [tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
  new_net_def.op.extend(new_ops)

  return new_net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
"""
nnlib does not have batch norm, so use tensorflow optimizer to fold
batch norm with convolution. The fold optimization reorders ops, so
......@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
add_output_node(net_def, output_node)
# optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
if prequantize:
net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def)
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph)
if prequantize:
dtype = mace_pb2.DT_UINT8
else:
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
return final_net_def
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册