From 345f7980513940a67514458607b96e198ffe4890 Mon Sep 17 00:00:00 2001
From: wuchenghui
Date: Wed, 4 Apr 2018 15:38:01 +0800
Subject: [PATCH] fix caffe converter for neon

---
 mace/kernels/activation.h                |  26 ++--
 mace/kernels/arm/activation.cc           |   4 +-
 mace/kernels/slice.h                     |  41 ++++--
 mace/ops/activation_test.cc              |  22 ++-
 mace/ops/slice.cc                        |   5 +
 mace/ops/slice.h                         |   6 +-
 mace/ops/slice_test.cc                   |  51 ++++---
 mace/python/tools/caffe_converter_lib.py | 173 ++++++++++++++++++-----
 8 files changed, 243 insertions(+), 85 deletions(-)

diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 5130ccdc..c60f4a3b 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -89,18 +89,22 @@ void DoActivation(const T *input_ptr,
 
 template <typename T>
 void PReLUActivation(const T *input_ptr,
-                     const index_t size,
+                     const index_t outer_size,
                      const index_t input_chan,
+                     const index_t inner_size,
                      const T *alpha_ptr,
                      T *output_ptr) {
-#pragma omp parallel for
-  for (index_t i = 0; i < size; ++i) {
-    const index_t chan_idx = i % input_chan;
-    T in = input_ptr[i];
-    if (in < 0) {
-      output_ptr[i] = in * alpha_ptr[chan_idx];
-    } else {
-      output_ptr[i] = in;
+#pragma omp parallel for collapse(3)
+  for (index_t i = 0; i < outer_size; ++i) {
+    for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
+      for (index_t j = 0; j < inner_size; ++j) {
+        index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j;
+        if (input_ptr[idx] < 0) {
+          output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx];
+        } else {
+          output_ptr[idx] = input_ptr[idx];
+        }
+      }
     }
   }
 }
@@ -120,7 +124,9 @@ class ActivationFunctor {
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
       const T *alpha_ptr = alpha->data<T>();
-      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
+      const index_t outer_size = output->dim(0) * output->dim(1)
+          * output->dim(2);
+      PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr,
                       output_ptr);
     } else {
       DoActivation(input_ptr, output_ptr, output->size(), activation_,
diff --git a/mace/kernels/arm/activation.cc b/mace/kernels/arm/activation.cc
index 30f21e03..59b94ffa 100644
--- a/mace/kernels/arm/activation.cc
+++ b/mace/kernels/arm/activation.cc
@@ -17,7 +17,9 @@ void ActivationFunctor<DeviceType::NEON, float>::operator()(
   if (activation_ == PRELU) {
     MACE_CHECK_NOTNULL(alpha);
     const float *alpha_ptr = alpha->data<float>();
-    PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr,
+    const index_t outer_size = output->dim(0);
+    const index_t inner_size = output->dim(2) * output->dim(3);
+    PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr,
                     output_ptr);
   } else {
     DoActivation(input_ptr, output_ptr, output->size(), activation_,
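A note on the PReLU hunks above: the kernel now walks an explicit (outer, channel, inner) index space instead of assuming the channel is the fastest-varying dimension, so the same routine covers both the NHWC CPU path (outer = N*H*W, inner = 1) and the NCHW NEON path (outer = N, inner = H*W). A minimal NumPy sketch of that indexing; the values, shapes and the helper name prelu are illustrative only, not taken from MACE:

    import numpy as np

    def prelu(x, alpha, outer, channels, inner):
        # Mirrors the kernel's flat index: idx = (i * channels + c) * inner + j
        v = x.reshape(outer, channels, inner)
        a = alpha.reshape(1, channels, 1)   # one slope per channel
        return np.where(v < 0, v * a, v).reshape(-1)

    x = np.array([-7., 7., -6., 6., -5., -5., -4., -4.])
    alpha = np.array([2., 3.])
    print(prelu(x, alpha, outer=4, channels=2, inner=1))  # NHWC view: [-14.  7. -12.  6. -10. -15.  -8. -12.]
    print(prelu(x, alpha, outer=1, channels=2, inner=4))  # NCHW view: [-14.  7. -12.  6. -15. -15. -12. -12.]

The same flat buffer produces different results under the two layouts, which is also why the NEON expectation added in mace/ops/activation_test.cc below differs from the CPU one.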
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 1bde41e5..58c21e7a 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_SLICE_H_
 #define MACE_KERNELS_SLICE_H_
 
+#include <functional>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -16,20 +17,34 @@
 namespace mace {
 namespace kernels {
 
+struct SliceFunctorBase {
+  explicit SliceFunctorBase(const int32_t axis) : axis_(axis) {}
+
+  int32_t axis_;
+};
+
 template <DeviceType D, typename T>
-struct SliceFunctor {
+struct SliceFunctor : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
-    const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2);
-    const index_t input_channels = input->dim(3);
+    const index_t input_channels = input->dim(axis_);
     const size_t outputs_count = output_list.size();
     const index_t output_channels = input_channels / outputs_count;
    std::vector<T *> output_ptrs(output_list.size(), nullptr);
+    std::vector<index_t> output_shape(input->shape());
+    output_shape[axis_] = output_channels;
 
-    std::vector<index_t> output_shape({input->dim(0), input->dim(1),
-                                       input->dim(2), output_channels});
-
+    const index_t outer_size = std::accumulate(output_shape.begin(),
+                                               output_shape.begin() + axis_,
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t inner_size = std::accumulate(output_shape.begin() + axis_ + 1,
+                                               output_shape.end(),
+                                               1,
+                                               std::multiplies<index_t>());
     for (size_t i= 0; i < outputs_count; ++i) {
      output_list[i]->Resize(output_shape);
      output_ptrs[i] = output_list[i]->mutable_data<T>();
@@ -38,25 +53,27 @@ struct SliceFunctor {
 
 #pragma omp parallel for
    for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      int input_idx = outer_idx * input_channels;
-      int output_idx = outer_idx * output_channels;
+      int input_idx = outer_idx * input_channels * inner_size;
+      int output_idx = outer_idx * output_channels * inner_size;
      for (size_t i = 0; i < outputs_count; ++i) {
        if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
          memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx,
-                 output_channels * sizeof(T));
+                 output_channels * inner_size * sizeof(T));
        } else {
-          for (index_t k = 0; k < output_channels; ++k) {
+          for (index_t k = 0; k < output_channels * inner_size; ++k) {
            *(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k);
          }
        }
-        input_idx += output_channels;
+        input_idx += output_channels * inner_size;
      }
    }
  }
};

 template <typename T>
-struct SliceFunctor<DeviceType::OPENCL, T> {
+struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
  void operator()(const Tensor *input,
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future);
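For reference, the generalized SliceFunctor above splits dimension axis_ into outputs_count equal pieces; per outer index it copies output_channels * inner_size contiguous elements into each output. NumPy's split does the equivalent bookkeeping, which makes for a compact cross-check (the shapes, axis and helper name below are arbitrary examples, not from the tests):

    import numpy as np

    def slice_like_mace(x, num_outputs, axis):
        # Equal split along `axis`, same element order as the kernel's memcpy loop.
        assert x.shape[axis] % num_outputs == 0
        return np.split(x, num_outputs, axis=axis)

    x = np.arange(2 * 4 * 3 * 3, dtype=np.float32).reshape(2, 4, 3, 3)  # an NCHW-shaped example
    parts = slice_like_mace(x, num_outputs=2, axis=1)
    print([p.shape for p in parts])  # [(2, 2, 3, 3), (2, 2, 3, 3)]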
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index 18bddcc7..034a591f 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -249,14 +249,26 @@ void TestSimplePrelu() {
     net.RunOp(D);
   }
 
-  auto expected = CreateTensor<float>(
-      {2, 2, 2, 2},
-      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+  if (D == DeviceType::NEON) {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  } else {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  }
+}
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+TEST_F(ActivationOpTest, CPUSimplePrelu) {
+  TestSimplePrelu<DeviceType::CPU>();
 }
 
-TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
+TEST_F(ActivationOpTest, NEONSimplePrelu) {
+  TestSimplePrelu<DeviceType::NEON>();
+}
 
 TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
   TestSimplePrelu<DeviceType::OPENCL>();
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
index d482b028..e8fe0acc 100644
--- a/mace/ops/slice.cc
+++ b/mace/ops/slice.cc
@@ -24,6 +24,11 @@ void Register_Slice(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     SliceOp<DeviceType::OPENCL, half>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/slice.h b/mace/ops/slice.h
index a1a6ad11..22869345 100644
--- a/mace/ops/slice.h
+++ b/mace/ops/slice.h
@@ -17,14 +17,16 @@ template <DeviceType D, typename T>
 class SliceOp : public Operator<D, T> {
  public:
   SliceOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws) {}
+      : Operator<D, T>(op_def, ws),
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
   bool Run(StatsFuture *future) override {
     MACE_CHECK(this->OutputSize() >= 2)
         << "There must be at least two outputs for slicing";
     const Tensor *input = this->Input(INPUT);
     const std::vector<Tensor *> output_list = this->Outputs();
-    MACE_CHECK((input->dim(3) % this->OutputSize()) == 0)
+    const int32_t slice_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
+    MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
         << "Outputs do not split input equally.";
 
     functor_(input, output_list, future);
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
index bd0244d3..ad507af3 100644
--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -16,7 +16,7 @@ namespace test {
 class SliceOpTest : public OpsTestBase {};
 
 template <DeviceType D, typename T>
-void RandomTest(const int num_outputs) {
+void RandomTest(const int num_outputs, const int axis) {
   static unsigned int seed = time(NULL);
   const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
   const index_t input_channels = num_outputs * output_channels;
@@ -27,7 +27,11 @@
 
   // Construct graph
   OpsTestNet net;
-  std::vector<index_t> input_shape({batch, height, width, input_channels});
+  std::vector<index_t> input_shape;
+  if (axis == 1)
+    input_shape = {batch, input_channels, height, width};
+  else if (axis == 3)
+    input_shape = {batch, height, width, input_channels};
   const index_t input_size = std::accumulate(input_shape.begin(),
                                              input_shape.end(),
                                              1,
@@ -49,7 +53,7 @@
         .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
-    auto builder = OpDefBuilder("Slice", "SliceTest");
+    auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
     builder.Input("Input");
     for (int i = 0; i < num_outputs; ++i) {
       builder = builder.Output(MakeString("Output", i));
@@ -70,9 +74,17 @@
   }
 
   // Check
-  std::vector<index_t> expected_shape({batch, height, width, output_channels});
+  std::vector<index_t> expected_shape;
+  if (axis == 1)
+    expected_shape = {batch, output_channels, height, width};
+  else if (axis == 3)
+    expected_shape = {batch, height, width, output_channels};
   const index_t outer_size = std::accumulate(expected_shape.begin(),
-                                             expected_shape.end() - 1,
+                                             expected_shape.begin() + axis,
+                                             1,
+                                             std::multiplies<index_t>());
+  const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1,
+                                             expected_shape.end(),
                                              1,
                                              std::multiplies<index_t>());
   const float *input_ptr = input_data.data();
@@ -83,8 +95,9 @@
     Tensor::MappingGuard output_mapper(output);
     output_ptr = output->data<float>();
     for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      const int idx = outer_idx * input_channels + i * output_channels;
-      for (int j = 0; j < output_channels; ++j) {
+      const int idx = (outer_idx * input_channels + i * output_channels)
+          * inner_size;
+      for (int j = 0; j < output_channels * inner_size; ++j) {
        ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2)
            << "with output " << i << " index " << idx + j;
      }
@@ -93,21 +106,27 @@
 }
 
 TEST_F(SliceOpTest, CPU) {
-  RandomTest<DeviceType::CPU, float>(2);
-  RandomTest<DeviceType::CPU, float>(4);
-  RandomTest<DeviceType::CPU, float>(11);
+  RandomTest<DeviceType::CPU, float>(2, 3);
+  RandomTest<DeviceType::CPU, float>(4, 3);
+  RandomTest<DeviceType::CPU, float>(11, 3);
+}
+
+TEST_F(SliceOpTest, CPUAxis1) {
+  RandomTest<DeviceType::CPU, float>(2, 1);
+  RandomTest<DeviceType::CPU, float>(4, 1);
+  RandomTest<DeviceType::CPU, float>(11, 1);
 }
 
 TEST_F(SliceOpTest, OPENCLFloat) {
-  RandomTest<DeviceType::OPENCL, float>(2);
-  RandomTest<DeviceType::OPENCL, float>(4);
-  RandomTest<DeviceType::OPENCL, float>(11);
+  RandomTest<DeviceType::OPENCL, float>(2, 3);
+  RandomTest<DeviceType::OPENCL, float>(4, 3);
+  RandomTest<DeviceType::OPENCL, float>(11, 3);
 }
 
 TEST_F(SliceOpTest, OPENCLHalf) {
-  RandomTest<DeviceType::OPENCL, half>(2);
-  RandomTest<DeviceType::OPENCL, half>(4);
-  RandomTest<DeviceType::OPENCL, half>(11);
+  RandomTest<DeviceType::OPENCL, half>(2, 3);
+  RandomTest<DeviceType::OPENCL, half>(4, 3);
+  RandomTest<DeviceType::OPENCL, half>(11, 3);
 }
 
 }  // namespace test
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 166bb6ec..7a94d280 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -68,14 +68,26 @@ def BlobToNPArray(blob):
 
 class Shapes(object):
   @staticmethod
-  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func):
+  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'):
    output_shape = np.zeros_like(input_shape)
    output_shape[0] = input_shape[0]
-    output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
-                                      - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
-    output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
-                                      - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
-    output_shape[3] = filter_shape[2]
+    if input_format == 'NHWC':
+      # input format: NHWC, filter format: HWOI
+      output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
+                                        - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
+                                        - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+      output_shape[3] = filter_shape[2]
+    elif input_format == 'NCHW':
+      # input format: NCHW, filter format: OIHW
+      output_shape[1] = filter_shape[0]
+      output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2]
+                                        - (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3]
+                                        - (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+    else:
+      raise Exception("format %s is not supported" % input_format)
+
    return output_shape

  @staticmethod
@@ -93,8 +105,13 @@ class Shapes(object):
    return output_shape

  @staticmethod
-  def slice_shape(input_shape, num_output):
-    return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+  def slice_shape(input_shape, num_output, input_format='NHWC'):
+    if input_format == 'NHWC':
+      return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+    elif input_format == 'NCHW':
+      return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]]
+    else:
+      raise Exception("format %s is not supported" % input_format)

 # outputs' name is [op.name + '_' + #]
 class CaffeConverter(object):
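The conv_pool_shape change above keeps the usual convolution output-size arithmetic, out = round((in + pad_total - effective_kernel) / stride) + 1 with effective_kernel = k + (k - 1) * (dilation - 1), and only moves which output slots hold H, W and the channel count for NCHW/OIHW versus NHWC/HWOI. A stand-alone check of that formula, with made-up sizes and assuming paddings[] holds the total padding per dimension as the expression implies:

    import math

    def conv_out_dim(in_dim, pad_total, kernel, stride, dilation, round_func=math.floor):
        effective_k = kernel + (kernel - 1) * (dilation - 1)  # dilation widens the kernel extent
        return int(round_func((in_dim + pad_total - effective_k) / float(stride))) + 1

    # 224 input, 3x3 kernel, stride 2, one pixel of padding on each side:
    print(conv_out_dim(224, 2, 3, 2, 1))  # 112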
@@ -168,7 +185,10 @@
      arg.i = self.dt
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NHWC'
+    if self.device == 'neon':
+      data_format_arg.s = 'NCHW'
+    else:
+      data_format_arg.s = 'NHWC'
    op_def.name = op.name
    op_def.type = mace_type
    op_def.input.extend([name+':0' for name in self.inputs_map[op.name]])
@@ -342,7 +362,11 @@

    # Add filter
    weight_tensor_name = op.name + '_weight:0'
-    weight_data = op.data[0].transpose((2, 3, 0, 1))
+    if self.device == 'neon':
+      weight_data = op.data[0]
+    else:
+      # OIHW -> HWOI
+      weight_data = op.data[0].transpose((2, 3, 0, 1))
    self.add_tensor(weight_tensor_name, weight_data)

    if self.device == 'gpu':
@@ -376,10 +400,11 @@
      final_op = op
      self.resolved_ops.add(op.name)

+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
                                          weight_data.shape, paddings, strides, dilations,
-                                          math.floor)
+                                          math.floor, input_format)
    op.output_shape_map[op.layer.top[0]] = output_shape

    if len(self.ops_map[final_op.name].children) == 1 \
@@ -399,9 +424,13 @@
    self.net_def.op.extend([op_def])

  def check_winograd_conv(self, op):
+    # TODO: support winograd conv on neon
+    if self.device == 'neon':
+      return False
    param = op.layer.convolution_param
    filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
    dilations = [1, 1]
    if len(param.dilation) == 1:
@@ -411,17 +440,21 @@
      dilations = [param.dilation[0], param.dilation[0]]
    elif len(param.dilation) == 2:
      dilations = [param.dilation[0], param.dilation[1]]
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(
      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, dilations, math.floor)
+      filter_shape, paddings, strides, dilations, math.floor, input_format)
    width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    return self.winograd and self.device == 'gpu' and \
-      filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
-      dilations[0] == 1 and (dilations[0] == dilations[1]) and \
-      (strides[0] == 1) and (strides[0] == strides[1]) and \
-      (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
-      (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
-      (width < OPENCL_IMAGE_MAX_SIZE)
+    if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
+        (strides[0] == 1) and (strides[0] == strides[1]):
+      if self.device == 'gpu':
+        return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
+          (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
+          (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
+          (width < OPENCL_IMAGE_MAX_SIZE)
+      elif self.device == 'neon':
+        return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
+    return False

  def convert_winograd_conv(self, op):
    # Add filter
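As the convert_conv2d and check_winograd_conv hunks above show, the NEON path keeps Caffe's native OIHW filter layout and only the GPU/NHWC path applies the (2, 3, 0, 1) permutation to get HWOI. A quick NumPy sanity check of that permutation; the shape below is arbitrary:

    import numpy as np

    oihw = np.random.rand(32, 16, 3, 3)          # (out, in, kernel_h, kernel_w), Caffe's layout
    hwoi = oihw.transpose((2, 3, 0, 1))          # the converter's OIHW -> HWOI permutation
    print(hwoi.shape)                            # (3, 3, 32, 16)
    assert hwoi[1, 2, 5, 7] == oihw[5, 7, 1, 2]  # same element, reindexed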
@@ -435,11 +468,13 @@
    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)

    filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI

+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(
      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, [1, 1], math.floor)
+      filter_shape, paddings, strides, [1, 1], math.floor, input_format)

    # Input transform
    wt_op = mace_pb2.OperatorDef()
@@ -455,8 +490,12 @@
    wt_output_name = wt_op.name + ":0"
    wt_op.output.extend([wt_output_name])
    wt_output_shape = mace_pb2.OutputShape()
-    wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    if self.device != 'neon':
+      wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    else:
+      wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
    wt_op.output_shape.extend([wt_output_shape])

    # MatMul
@@ -470,7 +509,10 @@
    matmul_output_name = matmul_op.name + ":0"
    matmul_op.output.extend([matmul_output_name])
    matmul_output_shape = mace_pb2.OutputShape()
-    matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    if self.device != 'neon':
+      matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    else:
+      matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
    matmul_op.output_shape.extend([matmul_output_shape])

    # Inverse transform
@@ -483,10 +525,10 @@
    batch_arg.i = output_shape[0]
    height_arg = iwt_op.arg.add()
    height_arg.name = 'height'
-    height_arg.i = output_shape[1]
+    height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
    width_arg = iwt_op.arg.add()
    width_arg.name = 'width'
-    width_arg.i = output_shape[2]
+    width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
    iwt_op.name = op.name + '_inverse_transform'
    iwt_op.type = 'WinogradInverseTransform'
    iwt_op.input.extend([matmul_output_name])
@@ -589,8 +631,9 @@
    weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
    assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
-    weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
-    weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
+    if self.device != 'neon':
+      weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
+      weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
    self.add_tensor(weight_tensor_name, weight_data)
    if self.device == 'gpu':
      if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
@@ -665,9 +708,12 @@
      kernel_arg.name = 'kernels'
      kernel_arg.ints.extend(kernels)

-    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
+    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
+      if self.device != 'neon' else \
+      [input_shape[1], input_shape[1], kernels[0], kernels[1]]
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
-                                          paddings, strides, [1, 1], math.ceil)
+                                          paddings, strides, [1, 1], math.ceil, input_format)
    op.output_shape_map[op.layer.top[0]] = output_shape

    op_def.output.extend([op.name + ':0'])
@@ -720,7 +766,7 @@
    op_def = self.CommonConvert(op, 'Concat')
    axis_arg = op_def.arg.add()
    axis_arg.name = 'axis'
-    axis_arg.i = 3
+    axis_arg.i = 3 if self.device != 'neon' else 1
    try:
      if op.layer.concat_param.HasFeild('axis'):
        axis_arg.i = op.concat_param.axis
@@ -766,13 +812,19 @@
    if len(param.slice_point) > 0:
      raise Exception('Mace do not support slice with slice_point')

+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = 3 if self.device != 'neon' else 1
+
    input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
    num_outputs = len(op.layer.top)
-    if (input_shape[3] % num_outputs) != 0 or \
-        (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
+    input_channels = input_shape[axis_arg.i]
+    if (input_channels % num_outputs) != 0 or \
+        (self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)):
      raise Exception('Mace do not support slice with input shape '
                      + str(input_shape) + ' and number of output ' + str(num_outputs))
-    output_shape = Shapes.slice_shape(input_shape, num_outputs)
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
+    output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format)
    for i in range(len(op.layer.top)):
      op.output_shape_map[op.layer.top[i]] = output_shape
    self.add_output_shape(op_def, output_shape)
@@ -790,10 +842,15 @@
    self.resolved_ops.add(op.name)

  def convert_reshape(self, op):
-    op_def = self.CommonConvert(op, 'ReOrganize')
+    if self.device == 'neon':
+      op_def = self.CommonConvert(op, 'Reshape')
+    else:
+      op_def = self.CommonConvert(op, 'ReOrganize')
    input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
    output_shape = input_shape
-    shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
+    shape_param = np.asarray(op.layer.reshape_param.shape.dim)
+    if self.device != 'neon':
+      shape_param = shape_param[[0, 3, 1, 2]]
    for i in range(len(shape_param)):
      if shape_param[i] != 0:
        output_shape[i] = shape_param[i]
@@ -867,15 +924,50 @@
    assert len(input_nodes) == len(input_shapes)
    for i in range(len(input_nodes)):
      input_op = self.ops_map[input_nodes[i]]
+      input_shape = input_shapes[i] if self.device != 'neon' else \
+        [input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]]
      if input_op.layer is not None:
-        input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i]
+        input_op.output_shape_map[input_op.layer.top[0]] = input_shape
      else:
-        input_op.output_shape_map[input_op.name] = input_shapes[i]
+        input_op.output_shape_map[input_op.name] = input_shape
+
+  def add_neon_input_transform(self, names):
+    for name in names:
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = name
+      op_def.type = 'Transpose'
+      op_def.input.extend([new_input_name])
+      op_def.output.extend([name+':0'])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 3, 1, 2])  # NHWC -> NCHW
+
+      arg = op_def.arg.add()
+      arg.name = 'T'
+      arg.i = self.dt
+
+  def add_neon_output_transform(self, names):
+    for name in names:
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = output_name[:-2]
+      op_def.type = 'Transpose'
+      op_def.input.extend([name+':0'])
+      op_def.output.extend([output_name])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 2, 3, 1])  # NCHW -> NHWC

  def convert(self, input_nodes, input_shapes, output_nodes):
    if self.device == 'gpu':
      self.add_input_transform(input_nodes)

+    if self.device == 'neon':
+      self.add_neon_input_transform(input_nodes)
+
    assert self.ops[0].type == 'Input'
    self.add_input_op_shape(input_nodes, input_shapes)
@@ -924,6 +1016,9 @@
    if self.device == 'cpu':
      self.replace_in_out_name(input_nodes, output_nodes)

+    if self.device == 'neon':
+      self.add_neon_output_transform(output_nodes)
+
    for op in self.ops:
      if op.name not in self.resolved_ops:
        print 'Unresolve Op: %s with type %s' % (op.name, op.type)
--
GitLab
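End-to-end, the NEON path above brackets the converted graph with two Transpose ops: NHWC -> NCHW on input (dims [0, 3, 1, 2]) and NCHW -> NHWC on output (dims [0, 2, 3, 1]). A small NumPy check that those two permutations really are inverses of each other; the input shape is illustrative:

    import numpy as np

    x = np.random.rand(1, 224, 224, 3)     # NHWC input
    nchw = x.transpose((0, 3, 1, 2))       # what the inserted input Transpose does
    back = nchw.transpose((0, 2, 3, 1))    # what the inserted output Transpose does
    print(nchw.shape, back.shape)          # (1, 3, 224, 224) (1, 224, 224, 3)
    assert np.array_equal(x, back)         # the permutations round-trip exactly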