diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 803bc2407fa23cbc57d0114155df36d1d633dd52..472733af3656db1969e278ee9743b2510c2980ea 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -15,30 +15,30 @@ namespace kernels { template class DepthwiseConv2dFunctor { public: - DepthwiseConv2dFunctor(const index_t* input_shape, - const index_t* filter_shape, - const int* strides, + DepthwiseConv2dFunctor(const index_t *input_shape, + const index_t *filter_shape, + const int *strides, const Padding padding, - const int* dilations) : + const int *dilations) : strides_(strides), paddings_(2, 0), dilations_(dilations) { CalPaddingSize(input_shape, filter_shape, dilations_, strides_, padding, paddings_.data()); } - DepthwiseConv2dFunctor(const int* strides, - const std::vector& paddings, - const int* dilations) : + DepthwiseConv2dFunctor(const int *strides, + const std::vector &paddings, + const int *dilations) : strides_(strides), paddings_(paddings), dilations_(dilations) {} - void operator()(const T* input, // NCHW - const index_t* input_shape, - const T* filter, // c_out, c_in, kernel_h, kernel_w - const index_t* filter_shape, - const T* bias, // c_out - T* output, // NCHW - const index_t* output_shape) { + void operator()(const T *input, // NCHW + const index_t *input_shape, + const T *filter, // c_out, c_in, kernel_h, kernel_w + const index_t *filter_shape, + const T *bias, // c_out + T *output, // NCHW + const index_t *output_shape) { MACE_CHECK_NOTNULL(output); @@ -80,7 +80,7 @@ class DepthwiseConv2dFunctor { index_t offset = n * channels * height * width + c * height * width + h * width + w; T sum = 0; - const T* filter_ptr = filter + c * kernel_size; + const T *filter_ptr = filter + c * kernel_size; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { int inh = padded_h_start + h * stride_h + dilation_h * kh; @@ -110,19 +110,19 @@ class DepthwiseConv2dFunctor { } } private: - const int* strides_; // [stride_h, stride_w] + const int *strides_; // [stride_h, stride_w] std::vector paddings_; // [padding_h, padding_w] - const int* dilations_; // [dilation_h, dilation_w] + const int *dilations_; // [dilation_h, dilation_w] }; -template <> -void DepthwiseConv2dFunctor::operator()(const float* input, - const index_t* input_shape, - const float* filter, - const index_t* filter_shape, - const float* bias, - float* output, - const index_t* output_shape); +template<> +void DepthwiseConv2dFunctor::operator()(const float *input, + const index_t *input_shape, + const float *filter, + const index_t *filter_shape, + const float *bias, + float *output, + const index_t *output_shape); } // namespace kernels } // namespace mace diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc index 006864a3aeab5d0518ebe646dfe11b6d2d64a4d4..a82505e79296e7f362139643bd700584d6d89caa 100644 --- a/mace/kernels/neon/conv_2d_neon_1x1.cc +++ b/mace/kernels/neon/conv_2d_neon_1x1.cc @@ -8,13 +8,13 @@ namespace mace { namespace kernels { -void Conv2dNeonK1x1S1(const float* input, // NCHW - const index_t* input_shape, - const float* filter, // c_out, c_in, kernel_h, kernel_w - const index_t* filter_shape, - const float* bias, // c_out - float* output, // NCHW - const index_t* output_shape) { +void Conv2dNeonK1x1S1(const float *input, // NCHW + const index_t *input_shape, + const float *filter, // c_out, c_in, kernel_h, kernel_w + const index_t *filter_shape, + const float *bias, // c_out + float *output, // NCHW + const index_t *output_shape) { const index_t batch = output_shape[0]; const index_t channels = output_shape[1]; const index_t height = output_shape[2]; @@ -26,7 +26,7 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW const index_t input_width = input_shape[3]; MACE_CHECK(input_batch == batch && input_height == height && - input_width == width); + input_width == width); const index_t total_pixels = height * width; // Process 4 * 2 = 8 pixels for each innermost loop @@ -36,17 +36,17 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW // benchmark omp collapsed(2) for (index_t n = 0; n < batch; ++n) { - const float* filter_ptr = filter; + const float *filter_ptr = filter; #pragma omp parallel for for (index_t c = 0; c < channels; ++c) { // TODO Will GCC opt these out? - float* channel_output_start = + float *channel_output_start = output + n * channels * height * width + c * height * width; - const float* input_ptr = + const float *input_ptr = input + n * input_channels * input_height * input_width; // Fill with bias - float* output_ptr = channel_output_start; + float *output_ptr = channel_output_start; for (index_t ptr = 0; ptr < total_pixels; ++ptr) { output_ptr[ptr] = bias[c]; // TODO can we avoid this? } @@ -54,15 +54,15 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW index_t inc = 0; // Process 4 input channels in batch for (; inc + 3 < input_channels; inc += 4) { - float* output_ptr = channel_output_start; + float *output_ptr = channel_output_start; // The begining of each input feature map channel MACE_ASSERT(input_ptr == - input + n * input_channels * input_height * input_width + - inc * input_height * input_width); + input + n * input_channels * input_height * input_width + + inc * input_height * input_width); - const float* input_ptr1 = input_ptr + total_pixels; - const float* input_ptr2 = input_ptr1 + total_pixels; - const float* input_ptr3 = input_ptr2 + total_pixels; + const float *input_ptr1 = input_ptr + total_pixels; + const float *input_ptr2 = input_ptr1 + total_pixels; + const float *input_ptr3 = input_ptr2 + total_pixels; // filter is in c_out, c_in, 1, 1 order MACE_ASSERT(filter_ptr == filter + c * input_channels + inc); @@ -140,10 +140,10 @@ void Conv2dNeonK1x1S1(const float* input, // NCHW } // Process the remaining channels for (; inc < input_channels; ++inc) { - float* output_ptr = channel_output_start; + float *output_ptr = channel_output_start; MACE_ASSERT(input_ptr == - input + n * input_channels * input_height * input_width + - inc * input_height * input_width); + input + n * input_channels * input_height * input_width + + inc * input_height * input_width); MACE_ASSERT(filter_ptr == filter + c * input_channels + inc); const float k0 = filter_ptr[0]; diff --git a/mace/kernels/neon/conv_2d_neon_3x3.cc b/mace/kernels/neon/conv_2d_neon_3x3.cc index 2b29ee6363e9a9f781788fcefeb95168e82bafe0..6b62cb5937f84c7169e1da05883e9eaf40da701c 100644 --- a/mace/kernels/neon/conv_2d_neon_3x3.cc +++ b/mace/kernels/neon/conv_2d_neon_3x3.cc @@ -20,19 +20,18 @@ namespace kernels { int multiplier = filter_shape == nullptr ? 0 : (filter_shape[0] / input_channels); \ int filter_in_channels = filter_shape == nullptr ? input_channels : filter_shape[1]; \ for (int b = 0; b < output_batch; ++b) { \ - float* output_ptr_base = output + b * output_channels * output_height * output_width; \ + float *output_ptr_base = output + b * output_channels * output_height * output_width; \ for (int oc = 0; oc < output_channels; ++oc) { \ - const float* filter_ptr = filter + oc * filter_in_channels * kFilterSize; \ - const float* input_ptr = input + b * input_channels * input_height * input_width; \ + const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize; \ + const float *input_ptr = input + b * input_channels * input_height * input_width; \ if (filter_shape != nullptr) { \ input_ptr += (oc / multiplier) * input_height * input_width; \ } \ - float* output_ptr = output_ptr_base + oc * output_height * output_width; \ + float *output_ptr = output_ptr_base + oc * output_height * output_width; \ std::fill(output_ptr, output_ptr + output_height * output_width, bias[oc]); \ for (int ic = 0; ic < filter_in_channels; ++ic) { \ float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), vld1q_f32(filter_ptr+3), vld1q_f32(filter_ptr+6)}; - #define KERNEL_TAIL_CODE \ filter_ptr += kFilterSize; \ input_ptr += input_height * input_width; \ diff --git a/mace/kernels/neon/conv_2d_neon_5x5.cc b/mace/kernels/neon/conv_2d_neon_5x5.cc index 724fe3e74bb44d3627201933749b92bf0aac452f..02c5ced2a3177af71544c6ccaf324cc133f686cf 100644 --- a/mace/kernels/neon/conv_2d_neon_5x5.cc +++ b/mace/kernels/neon/conv_2d_neon_5x5.cc @@ -10,13 +10,13 @@ namespace mace { namespace kernels { -void Conv2dNeonK5x5S1(const float* input, // NCHW - const index_t* input_shape, - const float* filter, // c_out, c_in, kernel_h, kernel_w - const index_t* filter_shape, - const float* bias, // c_out - float* output, // NCHW - const index_t* output_shape) { +void Conv2dNeonK5x5S1(const float *input, // NCHW + const index_t *input_shape, + const float *filter, // c_out, c_in, kernel_h, kernel_w + const index_t *filter_shape, + const float *bias, // c_out + float *output, // NCHW + const index_t *output_shape) { const index_t batch = output_shape[0]; const index_t channels = output_shape[1]; const index_t height = output_shape[2]; @@ -40,9 +40,9 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW #pragma omp parallel for collapse(2) for (index_t n = 0; n < batch; ++n) { for (index_t c = 0; c < channels; ++c) { - float* output_ptr = output + n * output_total_pixels_per_batch + - c * output_total_pixels_per_channel; - const float* input_ptr = input + n * input_total_pixels_per_batch; + float *output_ptr = output + n * output_total_pixels_per_batch + + c * output_total_pixels_per_channel; + const float *input_ptr = input + n * input_total_pixels_per_batch; // Fill with bias for (index_t i = 0; i < output_total_pixels_per_channel; ++i) { @@ -50,24 +50,24 @@ void Conv2dNeonK5x5S1(const float* input, // NCHW } for (index_t inc = 0; inc < input_channels; ++inc) { - float* outptr = output_ptr; - float* outptr2 = outptr + width; - - const float* inptr = input_ptr + inc * input_total_pixels_per_channel; - const float* filter_ptr = filter + c * patch_size + inc * 25; - - const float* r0 = inptr; - const float* r1 = inptr + input_width; - const float* r2 = inptr + input_width * 2; - const float* r3 = inptr + input_width * 3; - const float* r4 = inptr + input_width * 4; - const float* r5 = inptr + input_width * 5; - - const float* k0 = filter_ptr; - const float* k1 = filter_ptr + 5; - const float* k2 = filter_ptr + 10; - const float* k3 = filter_ptr + 15; - const float* k4 = filter_ptr + 20; + float *outptr = output_ptr; + float *outptr2 = outptr + width; + + const float *inptr = input_ptr + inc * input_total_pixels_per_channel; + const float *filter_ptr = filter + c * patch_size + inc * 25; + + const float *r0 = inptr; + const float *r1 = inptr + input_width; + const float *r2 = inptr + input_width * 2; + const float *r3 = inptr + input_width * 3; + const float *r4 = inptr + input_width * 4; + const float *r5 = inptr + input_width * 5; + + const float *k0 = filter_ptr; + const float *k1 = filter_ptr + 5; + const float *k2 = filter_ptr + 10; + const float *k3 = filter_ptr + 15; + const float *k4 = filter_ptr + 20; float32x4_t _k0123 = vld1q_f32(filter_ptr); float32x4_t _k4567 = vld1q_f32(filter_ptr + 4); diff --git a/mace/kernels/neon/depthwise_conv_neon.cc b/mace/kernels/neon/depthwise_conv_neon.cc index 7bf0a839ab0d7db294beb3e3bf073841bc84b986..eda2325d8b371218f2dcedefd34c124e3b75a9e9 100644 --- a/mace/kernels/neon/depthwise_conv_neon.cc +++ b/mace/kernels/neon/depthwise_conv_neon.cc @@ -25,13 +25,13 @@ extern void Conv2dNeonK3x3S2(const float *input, const index_t *output_shape); template<> -void DepthwiseConv2dFunctor::operator()(const float* input, // NCHW - const index_t* input_shape, - const float* filter, // c_out, c_in, kernel_h, kernel_w - const index_t* filter_shape, - const float* bias, // c_out - float* output, // NCHW - const index_t* output_shape) { +void DepthwiseConv2dFunctor::operator()(const float *input, // NCHW + const index_t *input_shape, + const float *filter, // c_out, c_in, kernel_h, kernel_w + const index_t *filter_shape, + const float *bias, // c_out + float *output, // NCHW + const index_t *output_shape) { typedef void (*Conv2dNeonFunction)( const float *input, const index_t *input_shape, diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index ad3206b0045db7be28452fcfc602ffc5da9082ff..3ac6689cd8d8a6f5198c56a70623ed50a7d6e0b7 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -13,17 +13,17 @@ namespace mace { -template +template class Conv2dOp : public ConvPool2dOpBase { public: - Conv2dOp(const OperatorDef& op_def, Workspace* ws) - : ConvPool2dOpBase(op_def, ws){}; + Conv2dOp(const OperatorDef &op_def, Workspace *ws) + : ConvPool2dOpBase(op_def, ws) {}; bool Run() override { - const Tensor* input = this->Input(INPUT); - const Tensor* filter = this->Input(FILTER); - const Tensor* bias = this->Input(BIAS); - Tensor* output = this->Output(OUTPUT); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->Input(BIAS); + Tensor *output = this->Output(OUTPUT); std::vector output_shape(4); std::vector paddings(2); diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index b6a458ead4e1c8c08630772a5a7f161ace2a3cd8..cc220f3c5f5848bf5e989adc466c585153eb55d7 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -14,25 +14,25 @@ namespace mace { -template +template class DepthwiseConv2dOp : public ConvPool2dOpBase { public: - DepthwiseConv2dOp(const OperatorDef& op_def, Workspace* ws) + DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) : ConvPool2dOpBase(op_def, ws), functor_(this->Input(INPUT)->shape().data(), this->Input(FILTER)->shape().data(), - this->strides_.data(), this->padding_, this->dilations_.data()){}; + this->strides_.data(), this->padding_, this->dilations_.data()) {}; bool Run() override { - const Tensor* input = this->Input(INPUT); - const Tensor* filter = this->Input(FILTER); - const Tensor* bias = this->Input(BIAS); - Tensor* output = this->Output(OUTPUT); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->Input(BIAS); + Tensor *output = this->Output(OUTPUT); // resize filter shape. std::vector filter_shape(filter->shape().begin(), filter->shape().end()); filter_shape[0] *= filter_shape[1]; - filter_shape[1] = 1; + filter_shape[1] = 1; std::vector output_shape(4); this->CalOutputSize(input->shape().data(), filter_shape.data(), output_shape.data()); output->Resize(output_shape);