diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc index 21eade9c8deba8ddd441b193e990e68793ac8f7a..8ca9765b99a73475fe1884278baac7995358b1a7 100644 --- a/mace/examples/mace_run.cc +++ b/mace/examples/mace_run.cc @@ -129,15 +129,21 @@ int main(int argc, char **argv) { // save output const Tensor *output = ws.GetTensor(output_node + ":0"); - Tensor::MappingGuard output_guard(output); - ofstream out_file(output_file, ios::binary); - out_file.write((const char *)(output->data()), - output->size() * sizeof(float)); - out_file.flush(); - out_file.close(); - VLOG(0) << "Output shape: [" - << output->dim(0) << ", " - << output->dim(1) << ", " - << output->dim(2) << ", " - << output->dim(3) << "]"; + std::remove(output_file.c_str()); + if (output != nullptr) { + Tensor::MappingGuard output_guard(output); + ofstream out_file(output_file, ios::binary); + out_file.write((const char *)(output->data()), + output->size() * sizeof(float)); + out_file.flush(); + out_file.close(); + stringstream ss; + ss << "Output shape: ["; + for (int i = 0; i < output->dim_size(); ++i) { + // Separator goes before every dim except the first, so there is no trailing ", ". + ss << (i == 0 ? "" : ", ") << output->dim(i); + } + ss << "]"; + VLOG(0) << ss.str(); + } } \ No newline at end of file diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc index 8e75cb9d7e369b57eb9caa0125a42f6d8b539c50..86aa0424f003c4c5815766bbab9dac2e6f5ee191 100644 --- a/mace/kernels/opencl/fused_conv_2d_opencl.cc +++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc @@ -28,6 +28,11 @@ extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const int *padding, const DataType dt, Tensor *output); +extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output); + template void FusedConv2dFunctor::operator()(const Tensor 
*input, {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; - index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + if (!input->is_image() || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] - << " is not implemented yet, using slow version"; - // TODO(heliangliang) The CPU/NEON kernel should map the buffer - FusedConv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output); - return; + << " is not implemented yet"; + MACE_NOT_IMPLEMENTED; } std::vector output_shape(4); @@ -66,16 +66,17 @@ void FusedConv2dFunctor::operator()(const Tensor *input, input->shape().data(), filter->shape().data(), dilations_, strides_, paddings_, output_shape.data(), paddings.data()); - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + + if (kernel_h == kernel_w && kernel_h <= 5 && + selector[kernel_h - 1][strides_[0] - 1] != nullptr) { + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum::value, output); } else { - output->Resize(output_shape); + Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), DataTypeToEnum::value, output); } - - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum::value, 
output); } template diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc index 896fbbc6ae700ce99968414c052c1ae07119c49c..7ce58e6ce18b34f5c2c4f8b97de3ff2cb3f0e508 100644 --- a/mace/ops/fused_conv_2d_test.cc +++ b/mace/ops/fused_conv_2d_test.cc @@ -408,3 +408,81 @@ TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { TestHalfComplexConvNxNS12({32, 32, 32, 64}); } +template +static void TestGeneralConvNxNS12(const std::vector &image_shape, + const std::vector &filter_shape) { + testing::internal::LogToStderr(); + auto func = [&](int stride_h, int stride_w, Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 1; + index_t height = image_shape[0]; + index_t width = image_shape[1]; + index_t input_channels = filter_shape[2]; + index_t output_channels = filter_shape[3]; + index_t kernel_h = filter_shape[0]; + index_t kernel_w = filter_shape[1]; + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}); + net.AddRandomInput("Bias", {output_channels}); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + 
.AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); + }; + + for (int stride : {1, 2}) { + func(stride, stride, VALID); + func(stride, stride, SAME); + } +} + +TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) { + TestGeneralConvNxNS12({32, 32}, + {7, 7, 3, 64}); +} + +TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) { + TestGeneralConvNxNS12({40, 40}, + {15, 1, 32, 64}); +} + diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py index 8f1cbf66dfa317118227c68c61939124bf005de5..886999d3f59bfb2f49f5db8bf598c8b462f64b17 100644 --- a/mace/python/tools/tf_converter.py +++ b/mace/python/tools/tf_converter.py @@ -24,7 +24,7 @@ def main(unused_args): input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize) else: output_graph_def = tf_converter_lib.convert_to_mace_pb( - input_graph_def, FLAGS.runtime) + input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.data_type, FLAGS.runtime) with gfile.GFile(FLAGS.output, "wb") as f: f.write(output_graph_def.SerializeToString()) @@ -67,6 +67,11 @@ def parse_args(): type=bool, default=False, help="e.g., False") + parser.add_argument( + "--data_type", + type=str, + default='DT_FLOAT', + help="e.g., DT_HALF/DT_FLOAT") return parser.parse_known_args() diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 39ec8a6eb189f95a200d65a5af6383c16f4ee994..80b5ee4253a8de05a82435ec5f7593734b757115 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -19,6 +19,11 @@ buffer_type_map = { 'ARGUMENT' : 2, } +data_type_map = { + 'DT_HALF' : mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT +} + 
def convert_tensor(op, tensor): tf_tensor = op.outputs[0].eval() tensor.name = op.outputs[0].name @@ -42,21 +47,70 @@ def get_input_tensor(op, index): input_tensor = get_input_tensor(input_tensor.op, 0) return input_tensor -def add_buffer_to_image(input_name, input_type, net_def): +def add_buffer_to_image(input_name, input_type, dt, net_def): output_name = input_name[:-2] + "_b2i" + input_name[-2:] op_def = net_def.op.add() - op_def.name = output_name + op_def.name = output_name[:-2] op_def.type = 'BufferToImage' op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = dt + return output_name + +def add_image_to_buffer(input_name, input_type, dt, net_def): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = dt + return output_name + +def add_input_transform(name, dt, net_def): + new_input_name = "mace_input_node:0" + op_def = net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + epsilon_arg = op_def.arg.add() epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map[input_type] + epsilon_arg.i = buffer_type_map['IN_OUT'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = dt + +def add_output_transform(name, net_def): + output_name = "mace_output_node:0" + op_def = net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + epsilon_arg = 
op_def.arg.add() - epsilon_arg.name = 'mode' - epsilon_arg.i = 0 - return output_name + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT'] -def convert_ops(unresolved_ops, net_def, device): +def convert_ops(unresolved_ops, dt, net_def, device): ops_count = len(unresolved_ops) resolved_count = 1 @@ -67,233 +121,235 @@ def convert_ops(unresolved_ops, net_def, device): elif first_op.type == 'Const': tensor = net_def.tensors.add() convert_tensor(first_op, tensor) - elif first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative': + else: op_def = net_def.op.add() - op_def.name = first_op.name - if first_op.type == 'DepthwiseConv2dNative': - op_def.type = 'DepthwiseConv2d' - else: - op_def.type = first_op.type - if device == 'gpu': - op_def.input.extend([first_op.inputs[0].name]) - output_name = add_buffer_to_image(first_op.inputs[1].name, "FILTER", net_def) - op_def.input.extend([output_name]) - else: - op_def.input.extend([input.name for input in first_op.inputs]) + arg = op_def.arg.add() + arg.name = 'T' + arg.i = dt - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[first_op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - final_op = first_op - - if ops_count >= 3 and unresolved_ops[1].type == 'Const' and unresolved_ops[2].type == 'BiasAdd' : - bias_tensor = unresolved_ops[1] - tensor = net_def.tensors.add() - convert_tensor(bias_tensor, tensor) - - bias_add_op = unresolved_ops[2] + if first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative': + op_def.name = first_op.name + if first_op.type == 'DepthwiseConv2dNative': + op_def.type = 'DepthwiseConv2d' + else: + op_def.type = first_op.type if device == 'gpu': - output_name = add_buffer_to_image(bias_add_op.inputs[1].name, 
"ARGUMENT", net_def) + op_def.input.extend([first_op.inputs[0].name]) + output_name = add_buffer_to_image(first_op.inputs[1].name, "FILTER", dt, net_def) op_def.input.extend([output_name]) else: - op_def.input.extend([bias_add_op.inputs[1].name]) - final_op = bias_add_op - resolved_count = 3 - - if ops_count >= 4 and unresolved_ops[3].type == 'Relu': - relu_op = unresolved_ops[3]; - op_def.type = "FusedConv2D" - final_op = relu_op - resolved_count = 4 - - op_def.output.extend([output.name for output in final_op.outputs]) - output_shapes = [] - for output in final_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) + op_def.input.extend([input.name for input in first_op.inputs]) - elif first_op.type == 'FusedBatchNorm': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = 'BatchNorm' - if device == 'gpu': - op_def.input.extend([first_op.inputs[0].name]) - for i in range(1, len(first_op.inputs)): - output_name = add_buffer_to_image(first_op.inputs[i].name, "ARGUMENT", net_def) - op_def.input.extend([output_name]) - else: - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([first_op.outputs[0].name]) - - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(first_op.outputs[0].shape.as_list()) - op_def.output_shape.extend([output_shape]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'epsilon' - epsilon_arg.f = first_op.get_attr('epsilon') - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - elif first_op.type == 'Add' and first_op.name.endswith( - 'batchnorm/add') and ops_count > 7: - add_op = first_op - mul_op = unresolved_ops[2] - mul_1_op = unresolved_ops[3] - mul_2_op = unresolved_ops[4] - sub_op = unresolved_ops[5] - add_1_op = unresolved_ops[6] - # print (mul_op.type, mul_2_op.type, mul_1_op.type, 
sub_op.type) - if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \ - mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add': - raise Exception('Invalid BatchNorm Op') - - get_input_tensor(mul_1_op, 0) - input_name = get_input_tensor(mul_1_op, 0).name - gamma = get_input_tensor(mul_op, 1).name - beta = get_input_tensor(sub_op, 0).name - mean = get_input_tensor(mul_2_op, 0).name - variance = get_input_tensor(add_op, 0).name - epsilon = get_input_tensor(add_op, 1).name + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[first_op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = first_op - op_def = net_def.op.add() - op_def.name = first_op.name[:-4] # remove /add - op_def.type = 'BatchNorm' - op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) - op_def.output.extend([output.name for output in add_1_op.outputs]) - output_shapes = [] - for output in add_1_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) + if ops_count >= 3 and unresolved_ops[1].type == 'Const' and unresolved_ops[2].type == 'BiasAdd' : + bias_tensor = unresolved_ops[1] + tensor = net_def.tensors.add() + convert_tensor(bias_tensor, tensor) + + bias_add_op = unresolved_ops[2] + if device == 'gpu': + output_name = add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT", dt, net_def) + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_add_op.inputs[1].name]) + final_op = bias_add_op + resolved_count = 3 + + if ops_count >= 4 and unresolved_ops[3].type == 'Relu': + relu_op = unresolved_ops[3]; + op_def.type = "FusedConv2D" + final_op = relu_op + resolved_count = 
4 + + op_def.output.extend([output.name for output in final_op.outputs]) + output_shapes = [] + for output in final_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + + elif first_op.type == 'FusedBatchNorm': + op_def.name = first_op.name + op_def.type = 'BatchNorm' + if device == 'gpu': + op_def.input.extend([first_op.inputs[0].name]) + for i in range(1, len(first_op.inputs)): + output_name = add_buffer_to_image(first_op.inputs[i].name, "ARGUMENT", dt, net_def) + op_def.input.extend([output_name]) + else: + op_def.input.extend([input.name for input in first_op.inputs]) + op_def.output.extend([first_op.outputs[0].name]) - resolved_count = 7 - elif first_op.type == 'Relu6': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = 'Relu' - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([output.name for output in first_op.outputs]) - output_shapes = [] - for output in first_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = 'Pooling' - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([output.name for output in first_op.outputs]) - output_shapes = [] - for output in first_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[first_op.type] - 
padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[first_op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - elif first_op.type == 'Add': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = "AddN" - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([output.name for output in first_op.outputs]) - output_shapes = [] - for output in first_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - elif first_op.type == 'ConcatV2': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = "Concat" - op_def.input.extend([first_op.inputs[i].name for i in xrange(2)]) - op_def.output.extend([output.name for output in first_op.outputs]) - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32) - output_shapes = [] - for output in first_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - elif first_op.type == 'ResizeBilinear': - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = "ResizeBilinear" - op_def.input.extend([first_op.inputs[0].name]) - op_def.output.extend([output.name for output in first_op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'size' - size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - size_arg.name = 
'align_corners' - size_arg.ints.extend(first_op.get_attr('align_corners')) - output_shapes = [] - for output in first_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']: - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = first_op.type - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([output.name for output in first_op.outputs]) - output_shapes = [] - for output in first_op.outputs: output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - else: - raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type)) - pass + output_shape.dims.extend(first_op.outputs[0].shape.as_list()) + op_def.output_shape.extend([output_shape]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = first_op.get_attr('epsilon') + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + elif first_op.type == 'Add' and first_op.name.endswith( + 'batchnorm/add') and ops_count > 7: + add_op = first_op + mul_op = unresolved_ops[2] + mul_1_op = unresolved_ops[3] + mul_2_op = unresolved_ops[4] + sub_op = unresolved_ops[5] + add_1_op = unresolved_ops[6] + # print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type) + if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \ + mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add': + raise Exception('Invalid BatchNorm Op') + + get_input_tensor(mul_1_op, 0) + input_name = get_input_tensor(mul_1_op, 0).name + gamma = get_input_tensor(mul_op, 1).name + beta = get_input_tensor(sub_op, 0).name + mean = get_input_tensor(mul_2_op, 0).name + variance = 
get_input_tensor(add_op, 0).name + epsilon = get_input_tensor(add_op, 1).name + + op_def.name = first_op.name[:-4] # remove /add + op_def.type = 'BatchNorm' + op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) + op_def.output.extend([output.name for output in add_1_op.outputs]) + output_shapes = [] + for output in add_1_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + + resolved_count = 7 + elif first_op.type == 'Relu6': + op_def.name = first_op.name + op_def.type = 'Relu' + op_def.input.extend([input.name for input in first_op.inputs]) + op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool': + op_def.name = first_op.name + op_def.type = 'Pooling' + op_def.input.extend([input.name for input in first_op.inputs]) + op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[first_op.type] + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[first_op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) + kernels_arg = op_def.arg.add() + 
kernels_arg.name = 'kernels' + kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + elif first_op.type == 'Add': + op_def.name = first_op.name + op_def.type = "AddN" + op_def.input.extend([input.name for input in first_op.inputs]) + op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + elif first_op.type == 'ConcatV2': + op_def.name = first_op.name + op_def.type = "Concat" + op_def.input.extend([first_op.inputs[i].name for i in xrange(2)]) + op_def.output.extend([output.name for output in first_op.outputs]) + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + elif first_op.type == 'ResizeBilinear': + op_def.name = first_op.name + op_def.type = "ResizeBilinear" + op_def.input.extend([first_op.inputs[0].name]) + op_def.output.extend([output.name for output in first_op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'size' + size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + size_arg.name = 'align_corners' + size_arg.i = first_op.get_attr('align_corners') + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']: 
+ op_def.name = first_op.name + op_def.type = first_op.type + op_def.input.extend([input.name for input in first_op.inputs]) + op_def.output.extend([output.name for output in first_op.outputs]) + output_shapes = [] + for output in first_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + else: + raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type)) + pass for i in range(resolved_count): del unresolved_ops[0] -def convert_to_mace_pb(input_graph_def, device): +def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device): net_def = mace_pb2.NetDef() + dt = data_type_map[data_type] with tf.Session() as session: with session.graph.as_default() as graph: tf.import_graph_def(input_graph_def, name="") ops = graph.get_operations() unresolved_ops = ops + if device == 'gpu': + add_input_transform(input_node, dt, net_def) while len(unresolved_ops) > 0: - convert_ops(unresolved_ops, net_def, device) + convert_ops(unresolved_ops, dt, net_def, device) + if device == 'gpu': + add_output_transform(output_node, net_def) print "PB Parsed." diff --git a/tools/validate_icnet.py b/tools/validate.py similarity index 67% rename from tools/validate_icnet.py rename to tools/validate.py index 319011e7241dfb8007f47f959bb531a1b6302c59..42a856b02b8b67f758c1d62cd1fcc122145a3738 100644 --- a/tools/validate_icnet.py +++ b/tools/validate.py @@ -1,5 +1,7 @@ import argparse import sys +import os +import os.path import tensorflow as tf import numpy as np @@ -13,28 +15,35 @@ from tensorflow import gfile # 3. adb pull the result. # 4. 
Compare output data of mace and tf -# python validate_icnet.py --model_file opt_icnet.pb \ -# --tf_input_file input_file \ +# python validate.py --model_file opt_icnet.pb \ +# --input_file input_file \ # --mace_out_file icnet.out def generate_data(shape): np.random.seed(FLAGS.random_seed) data = np.random.random(shape) - print FLAGS.tf_input_file - data.astype(np.float32).tofile(FLAGS.tf_input_file) - mace_data = np.transpose(data, axes=(2, 0, 1)) - mace_data.astype(np.float32).tofile(FLAGS.mace_input_file) + print FLAGS.input_file + data.astype(np.float32).tofile(FLAGS.input_file) print "Generate input file done." def load_data(file): - return np.fromfile(file=file, dtype=np.float32) + if os.path.isfile(file): + return np.fromfile(file=file, dtype=np.float32) + else: + return np.empty([0]) def valid_output(out_shape, mace_out_file, tf_out_value): mace_out_value = load_data(mace_out_file) - mace_out_value = mace_out_value.reshape(out_shape) - tf_out_data_t = np.transpose(tf_out_value, axes=(0, 3, 1, 2)) - res = np.allclose(mace_out_value, tf_out_data_t, rtol=0, atol=1e-5) - print 'Passed! Haha' if res else 'Failed! Oops' + if mace_out_value.size != 0: + mace_out_value = mace_out_value.reshape(out_shape) + # Do not assert here: raising on mismatch would skip the Failed banner below. + res = np.allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3) + if res: + print '=======================Passed! Haha======================' + else: + print '=======================Failed! 
Oops======================' + else: + print '=======================Skip empty node===================' def run_model(input_shape): @@ -51,13 +60,14 @@ def run_model(input_shape): with tf.Session() as session: with session.graph.as_default() as graph: tf.import_graph_def(input_graph_def, name="") - input_node = graph.get_tensor_by_name('input_node:0') - output_node = graph.get_tensor_by_name('output_node:0') + input_node = graph.get_tensor_by_name(FLAGS.input_node + ':0') + output_node = graph.get_tensor_by_name(FLAGS.output_node + ':0') - input_value = load_data(FLAGS.tf_input_file) + input_value = load_data(FLAGS.input_file) input_value = input_value.reshape(input_shape) output_value = session.run(output_node, feed_dict={input_node: [input_value]}) + # output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_weight') return output_value def main(unused_args): @@ -80,15 +90,10 @@ def parse_args(): default="", help="TensorFlow \'GraphDef\' file to load.") parser.add_argument( - "--tf_input_file", - type=str, - default="", - help="tensorflow input data to load.") - parser.add_argument( - "--mace_input_file", + "--input_file", type=str, default="", - help="mace input data to load.") + help="input file.") parser.add_argument( "--mace_out_file", type=str, @@ -97,13 +102,23 @@ def parse_args(): parser.add_argument( "--input_shape", type=str, - default="480,480,3", + default="512,512,3", help="input shape.") parser.add_argument( "--output_shape", type=str, - default="1,2,480,480", + default="1,512,512,2", help="output shape.") + parser.add_argument( + "--input_node", + type=str, + default="input_node", + help="input node") + parser.add_argument( + "--output_node", + type=str, + default="output_node", + help="output node") parser.add_argument( "--generate_data", type='bool', diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh new file mode 100644 index 0000000000000000000000000000000000000000..f4dfc6ebc33dc5fd37ff6efc85aef386a88f0253 --- 
/dev/null +++ b/tools/validate_gcn.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Must run at root dir of mace project. + +Usage() { + echo 'Usage: bash tools/validate_gcn.sh tf_model_file' +} + +if [ $# != 1 ];then + Usage + exit -1 +fi + +TF_MODEL_FILE_PATH=$1 +MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) +MACE_MODEL_NAME='mace_model.pb' +INPUT_FILE_NAME='model_input' +OUTPUT_FILE_NAME='gcn.out' +OUTPUT_LIST_FILE='gcn.list' +PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}" +KERNEL_DIR="${PHONE_DATA_DIR}/cl/" + +# Step 1: Generate input data +echo "Step 1: Generate input data" +python tools/validate.py --generate_data true --random_seed 1 \ + --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \ + --input_shape=512,512,3 + +# Step 2: convert tf model to mace model +echo "Step 2: convert tf model to mace model" +bazel build //mace/python/tools:tf_converter +bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ + --output=${MODEL_DIR}/${MACE_MODEL_NAME} \ + --input_node=input \ + --output_node=GCN/br_result_2/fcn_br \ + --data_type=DT_FLOAT \ + --runtime=gpu + + +# Step 3: Run model on the phone +echo "Step 3: Run model on the phone" +bazel build -c opt --strip always mace/examples:mace_run \ + --crosstool_top=//external:android/crosstool \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --cpu=arm64-v8a + +adb shell "mkdir -p ${PHONE_DATA_DIR}" +adb shell "mkdir -p ${KERNEL_DIR}" +adb push mace/kernels/opencl/cl/* ${KERNEL_DIR} +adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR} +adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} +adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} + +num_threads=${1:-1} + +adb