提交 d2e4da63 编写于 作者: Y Yin Li

Pre-quantize data on CPU

上级 902f320c
...@@ -228,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() { ...@@ -228,4 +228,111 @@ void HexagonControlWrapper::ResetPerfInfo() {
hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME); hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME);
} }
// Runs the graph identified by nn_id_ through hexagon_nn_execute for the
// single-input / single-output case.
//
// input_tensor:  tensor whose four shape entries and raw bytes are handed to
//                hexagon_nn_execute.
// output_tensor: set to output_data_types_[0] / output_shapes_[0] and then used
//                as the destination buffer of the call.
//
// Returns true when hexagon_nn_execute reports 0, false otherwise. The
// reported output shape and byte count are only validated after a successful
// call, because they are not meaningful when the call failed.
bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
                                         Tensor *output_tensor) {
  LOG(INFO) << "Execute graph: " << nn_id_;
  // single input and single output
  MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");

  // The call below indexes entries 0..3 of the shape, so it must have four.
  const auto &input_shape = input_tensor.shape();
  MACE_ASSERT(input_shape.size() == 4, "Wrong input dims num");

  output_tensor->SetDtype(output_data_types_[0]);
  output_tensor->Resize(output_shapes_[0]);

  vector<uint32_t> output_shape(4);
  // Initialised so that it is never read as an indeterminate value.
  uint32_t output_bytes = 0;
  int res = hexagon_nn_execute(nn_id_,
                               input_shape[0],
                               input_shape[1],
                               input_shape[2],
                               input_shape[3],
                               reinterpret_cast<const unsigned char *>(
                                   input_tensor.raw_data()),
                               input_tensor.raw_size(),
                               &output_shape[0],
                               &output_shape[1],
                               &output_shape[2],
                               &output_shape[3],
                               reinterpret_cast<unsigned char *>(
                                   output_tensor->raw_mutable_data()),
                               output_tensor->raw_size(),
                               &output_bytes);
  if (res != 0) {
    // Report the failure to the caller instead of asserting on outputs that
    // the failed call may not have written.
    return false;
  }

  MACE_ASSERT(output_shape == output_shapes_[0],
              "wrong output shape inferred");
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return true;
}
// Runs the graph identified by nn_id_ through hexagon_nn_execute_new with an
// arbitrary number of inputs and outputs.
//
// input_tensors:  one tensor per graph input; the four shape entries fill the
//                 batches/height/width/depth fields and the raw bytes are
//                 passed as the data buffer.
// output_tensors: one tensor per graph output; element i is set to
//                 output_data_types_[i] / output_shapes_[i] and then used as
//                 the destination buffer.
//
// Returns true when hexagon_nn_execute_new reports 0, false otherwise. The
// reported output shapes and valid lengths are only validated after a
// successful call.
bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
                                            vector<Tensor> *output_tensors) {
  LOG(INFO) << "Execute graph new: " << nn_id_;
  int num_inputs = static_cast<int>(input_tensors.size());
  int num_outputs = static_cast<int>(output_tensors->size());
  MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
  MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");

  // Vectors own the descriptor arrays (no manual delete[]) and value-initialise
  // every field, so fields that are not filled in below start as zero instead
  // of being indeterminate.
  vector<hexagon_nn_tensordef> inputs(input_tensors.size());
  vector<hexagon_nn_tensordef> outputs(output_tensors->size());

  for (int i = 0; i < num_inputs; ++i) {
    const auto &input_shape = input_tensors[i].shape();
    // Entries 0..3 of the shape are indexed below, so it must have four.
    MACE_ASSERT(input_shape.size() == 4, "Wrong input dims num");
    inputs[i].batches = input_shape[0];
    inputs[i].height = input_shape[1];
    inputs[i].width = input_shape[2];
    inputs[i].depth = input_shape[3];
    // The descriptor field is a non-const pointer, hence the const_cast on the
    // read-only input buffer.
    inputs[i].data = const_cast<unsigned char *>(
        reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
    inputs[i].dataLen = input_tensors[i].raw_size();
    inputs[i].data_valid_len = input_tensors[i].raw_size();
    inputs[i].unused = 0;
  }

  for (int i = 0; i < num_outputs; ++i) {
    (*output_tensors)[i].SetDtype(output_data_types_[i]);
    (*output_tensors)[i].Resize(output_shapes_[i]);
    outputs[i].data = reinterpret_cast<unsigned char *>(
        (*output_tensors)[i].raw_mutable_data());
    outputs[i].dataLen = (*output_tensors)[i].raw_size();
  }

  int res = hexagon_nn_execute_new(nn_id_, inputs.data(), num_inputs,
                                   outputs.data(), num_outputs);
  if (res != 0) {
    // Report the failure to the caller instead of asserting on descriptors
    // that the failed call may not have filled in.
    return false;
  }

  for (int i = 0; i < num_outputs; ++i) {
    vector<uint32_t> output_shape {outputs[i].batches, outputs[i].height,
                                   outputs[i].width, outputs[i].depth};
    MACE_ASSERT(output_shape == output_shapes_[i],
                "wrong output shape inferred");
    MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
                "wrong output bytes inferred.");
  }
  return true;
}
bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
Tensor *output_tensor) {
vector<Tensor> input_tensors(3);
vector<Tensor> output_tensors(3);
input_tensors[0].SetDtype(DT_UINT8);
output_tensors[0].SetDtype(DT_UINT8);
input_tensors[0].ResizeLike(input_tensor);
input_tensors[1].Resize({1, 1, 1, 1});
float *min_in_data = input_tensors[1].mutable_data<float>();
input_tensors[2].Resize({1, 1, 1, 1});
float *max_in_data = input_tensors[2].mutable_data<float>();
quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data, max_in_data);
if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
return false;
}
output_tensor->ResizeLike(output_tensors[0]);
const float *min_out_data = output_tensors[1].data<float>();
const float *max_out_data = output_tensors[2].data<float>();
quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data, output_tensor);
return true;
}
} // namespace mace } // namespace mace
\ No newline at end of file
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "mace/dsp/hexagon/hexagon_controller.h" #include "mace/dsp/hexagon/hexagon_controller.h"
#include "mace/dsp/hexagon_nn_ops.h" #include "mace/dsp/hexagon_nn_ops.h"
#include "mace/dsp/util/quantize.h"
#include "mace/core/common.h" #include "mace/core/common.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
...@@ -23,83 +24,10 @@ class HexagonControlWrapper { ...@@ -23,83 +24,10 @@ class HexagonControlWrapper {
bool Finalize(); bool Finalize();
bool SetupGraph(const NetDef& net_def); bool SetupGraph(const NetDef& net_def);
bool SetupGraph(const std::string &model_file); bool SetupGraph(const std::string &model_file);
bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor) { bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
LOG(INFO) << "Execute graph: " << nn_id_; bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
// single input and single output vector<Tensor> *output_tensors);
MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num"); bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
output_tensor->SetDtype(output_data_types_[0]);
output_tensor->Resize(output_shapes_[0]);
vector<uint32_t> output_shape(4);
uint32_t output_bytes;
int res = hexagon_nn_execute(nn_id_,
input_tensor.shape()[0],
input_tensor.shape()[1],
input_tensor.shape()[2],
input_tensor.shape()[3],
reinterpret_cast<const unsigned char *>(
input_tensor.raw_data()),
input_tensor.raw_size(),
&output_shape[0],
&output_shape[1],
&output_shape[2],
&output_shape[3],
reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data()),
output_tensor->raw_size(),
&output_bytes);
MACE_ASSERT(output_shape == output_shapes_[0],
"wrong output shape inferred");
MACE_ASSERT(output_bytes == output_tensor->raw_size(),
"wrong output bytes inferred.");
return res == 0;
};
bool ExecuteGraphNew(const Tensor *input_tensors, int num_inputs,
Tensor *output_tensors, int num_outputs) {
LOG(INFO) << "Execute graph new: " << nn_id_;
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
hexagon_nn_tensordef *inputs = new hexagon_nn_tensordef[num_inputs];
hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
for (int i = 0; i < num_inputs; ++i) {
vector<index_t> input_shape = input_tensors[i].shape();
inputs[i].batches = input_shape[0];
inputs[i].height = input_shape[1];
inputs[i].width = input_shape[2];
inputs[i].depth = input_shape[3];
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
inputs[i].dataLen = input_tensors[i].raw_size();
inputs[i].data_valid_len = input_tensors[i].raw_size();
inputs[i].unused = 0;
}
for (int i = 0; i < num_outputs; ++i) {
output_tensors[i].SetDtype(output_data_types_[i]);
output_tensors[i].Resize(output_shapes_[i]);
vector<index_t> output_shape = output_tensors[0].shape();
outputs[i].batches = output_shape[0];
outputs[i].height = output_shape[1];
outputs[i].width = output_shape[2];
outputs[i].depth = output_shape[3];
outputs[i].data = reinterpret_cast<unsigned char *>(
output_tensors[i].raw_mutable_data());
outputs[i].dataLen = output_tensors[i].raw_size();
outputs[i].data_valid_len = output_tensors[i].raw_size();
outputs[i].unused = 0;
}
int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
outputs, num_outputs);
delete [] inputs;
delete [] outputs;
return res == 0;
};
bool TeardownGraph(); bool TeardownGraph();
void PrintLog(); void PrintLog();
...@@ -118,6 +46,7 @@ class HexagonControlWrapper { ...@@ -118,6 +46,7 @@ class HexagonControlWrapper {
int nn_id_; int nn_id_;
Serializer serializer_; Serializer serializer_;
Quantizer quantizer_;
vector<vector<index_t>> input_shapes_; vector<vector<index_t>> input_shapes_;
vector<vector<index_t>> output_shapes_; vector<vector<index_t>> output_shapes_;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
using namespace mace; using namespace mace;
TEST(HexagonControlerWrapper, GetVersion) { TEST(HexagonControlerWrapper, InputFloat) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
HexagonControlWrapper wrapper; HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion(); VLOG(0) << "version: " << wrapper.GetVersion();
...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -29,7 +29,7 @@ TEST(HexagonControlerWrapper, GetVersion) {
wrapper.ResetPerfInfo(); wrapper.ResetPerfInfo();
timeval tv1, tv2; timeval tv1, tv2;
gettimeofday(&tv1, NULL); gettimeofday(&tv1, NULL);
int round = 2; int round = 10;
for (int i = 0; i < round; ++i) { for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor); VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
} }
...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) { ...@@ -49,6 +49,50 @@ TEST(HexagonControlerWrapper, GetVersion) {
} }
std::cout << std::endl; std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
TEST(HexagonControlerWrapper, PreQuantize) {
testing::internal::LogToStderr();
HexagonControlWrapper wrapper;
VLOG(0) << "version: " << wrapper.GetVersion();
wrapper.Init();
wrapper.SetDebugLevel(0);
wrapper.Config();
VLOG(0) << wrapper.SetupGraph("quantized_icnet_dsp_u8.pb");
wrapper.PrintGraph();
Tensor input_tensor;
Tensor output_tensor;
input_tensor.Resize({1, 480, 480, 3});
float *input_data = input_tensor.mutable_data<float>();
for (int i = 0; i < input_tensor.size(); ++i) {
input_data[i] = i % 256;
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraphPreQuantize(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
wrapper.GetPerfInfo();
wrapper.PrintLog();
const float *output_data = output_tensor.data<float>();
for (int i = 0; i < output_tensor.size(); ++i) {
std::cout << output_data[i] << " ";
}
std::cout << std::endl;
VLOG(0) << wrapper.TeardownGraph(); VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize(); wrapper.Finalize();
} }
\ No newline at end of file
...@@ -21,7 +21,7 @@ def main(unused_args): ...@@ -21,7 +21,7 @@ def main(unused_args):
if FLAGS.runtime == 'dsp': if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node) input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
else: else:
output_graph_def = tf_converter_lib.convert_to_mace_pb( output_graph_def = tf_converter_lib.convert_to_mace_pb(
input_graph_def) input_graph_def)
...@@ -62,6 +62,11 @@ def parse_args(): ...@@ -62,6 +62,11 @@ def parse_args():
type=str, type=str,
default="softmax", default="softmax",
help="e.g., softmax") help="e.g., softmax")
# argparse applies `type` to the raw command-line string, and bool() of any
# non-empty string is True, so `--prequantize False` used to enable
# pre-quantization. Parse the text explicitly instead: only the usual "false"
# spellings (and the empty string) disable the flag, every other value keeps
# enabling it as before.
parser.add_argument(
"--prequantize",
type=lambda text: text.strip().lower() not in (
"", "0", "false", "f", "no", "n", "off"),
default=False,
help="e.g., False")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -5,7 +5,7 @@ from dsp_ops import DspOps ...@@ -5,7 +5,7 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util from mace.python.tools import graph_util
# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \ # converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
# --runtime dsp --input_dim input_node,1,480,480,3 --output_node icnet/output_node # --runtime dsp --input_node input_node --output_node output_node
padding_mode = { padding_mode = {
'NA': 0, 'NA': 0,
...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -208,8 +208,8 @@ def reverse_batch_to_space_and_biasadd(net_def):
for follow_op in follow_ops: for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef() new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op) new_follow_op.CopyFrom(follow_op)
for i in range(len(follow_op.input)): for i in xrange(len(follow_op.input)):
for k in range(3): for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
new_ops.append(new_follow_op) new_ops.append(new_follow_op)
...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def): ...@@ -220,9 +220,7 @@ def reverse_batch_to_space_and_biasadd(net_def):
new_net_def = mace_pb2.NetDef() new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values()) new_net_def.tensors.extend(tensor_map.values())
for op in net_def.op: new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
if op.name not in skip_ops:
new_net_def.op.extend([op])
new_net_def.op.extend(new_ops) new_net_def.op.extend(new_ops)
return new_net_def return new_net_def
...@@ -249,29 +247,101 @@ def add_node_id(net_def): ...@@ -249,29 +247,101 @@ def add_node_id(net_def):
return net_def return net_def
def add_input_output_info(net_def, input_node, output_node, graph): def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
for op in net_def.op: input_info = net_def.input_info.add()
if op.name == input_node: input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add() input_info = net_def.input_info.add()
input_info.name = op.name input_info.dims.extend([1,1,1,1])
input_info.node_id = op.node_id input_info.data_type = mace_pb2.DT_FLOAT
input_info.dims.extend(input_tensor.shape.as_list())
input_info.max_byte_size = max_elem_size(input_tensor) output_info = net_def.output_info.add()
input_info.data_type = find_dtype(input_tensor.dtype) output_info.dims.extend(output_tensor.shape.as_list())
elif op.name == output_node: output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add() output_info = net_def.output_info.add()
output_info.name = op.name output_info.dims.extend([1,1,1,1])
output_info.node_id = op.node_id output_info.data_type = mace_pb2.DT_FLOAT
output_info.dims.extend(output_tensor.shape.as_list())
output_info.max_byte_size = max_elem_size(output_tensor)
output_info.data_type = find_dtype(output_tensor.dtype)
return net_def return net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node): def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
skip_ops = set()
new_ops = []
skip_tensors = set()
# INPUT->Flatten->Minf, Maxf->Quantize
for op in net_def.op:
if op.type == 'INPUT':
input_op = op
flatten_op = None
quantize_op = None
for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
if o.type == 'Flatten':
flatten_op = o
elif o.type == 'Quantize':
quantize_op = o
if quantize_op is not None:
minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
new_input_op = mace_pb2.OperatorDef()
new_input_op.name = input_op.name
new_input_op.type = input_op.type
new_input_op.padding = input_op.padding
new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
new_ops.append(new_input_op)
for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
elif op.type == 'OUTPUT':
output_op = op
dequantize_op = get_node_from_map(op_map, output_op.input[0])
if dequantize_op.type == 'Dequantize':
skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
new_output_op = mace_pb2.OperatorDef()
new_output_op.name = output_op.name
new_output_op.type = output_op.type
new_output_op.input.extend(dequantize_op.input)
new_ops.append(new_output_op)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
""" """
nnlib does not have batch norm, so use tensorflow optimizer to fold nnlib does not have batch norm, so use tensorflow optimizer to fold
batch norm with convolution. The fold optimization reorders ops, so batch norm with convolution. The fold optimization reorders ops, so
...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node): ...@@ -298,10 +368,18 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node):
add_output_node(net_def, output_node) add_output_node(net_def, output_node)
# optimized_net_def = reverse_batch_to_space_and_biasadd(net_def) # optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
if prequantize:
net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def) net_def_with_node_id = add_node_id(sorted_net_def)
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph) if prequantize:
dtype = mace_pb2.DT_UINT8
else:
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
return final_net_def return final_net_def
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册