From 1fcd812b1a0c86b96fa3b86538248c9a3a5dca0c Mon Sep 17 00:00:00 2001 From: liutuo Date: Wed, 16 May 2018 19:39:41 +0800 Subject: [PATCH] fix ndk-r15c openmp bugs --- mace/core/runtime/cpu/cpu_runtime.cc | 1 + mace/kernels/arm/conv_2d_neon.h | 54 ++---- mace/kernels/arm/conv_2d_neon_3x3.cc | 53 +++--- mace/kernels/arm/conv_2d_neon_5x5.cc | 26 +-- mace/kernels/arm/conv_2d_neon_7x7.cc | 78 ++++----- mace/kernels/arm/depthwise_conv2d_neon.h | 24 +-- mace/kernels/arm/depthwise_conv2d_neon_3x3.cc | 79 +++++---- mace/kernels/conv_2d.h | 158 +++++++----------- mace/kernels/deconv_2d.h | 109 ++++++------ mace/kernels/depth_to_space.h | 38 ++--- mace/kernels/depthwise_conv2d.h | 92 ++++------ mace/kernels/image_to_buffer.h | 4 + mace/kernels/opencl/bias_add.cc | 3 +- mace/kernels/pooling.h | 153 +++++++---------- mace/kernels/winograd_transform.h | 1 + mace/ops/activation.h | 2 +- mace/ops/ops_test_util.h | 2 + mace/utils/logging.cc | 1 + 18 files changed, 371 insertions(+), 507 deletions(-) diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 09891e8c..8fb3ee5b 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,6 +18,7 @@ #include #endif +#include #include #include #include diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index 3d3c907e..5d2d5f9a 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input, extern void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); } // namespace kernels diff --git 
a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc index 0e4ac0eb..58b28ddc 100644 --- a/mace/kernels/arm/conv_2d_neon_3x3.cc +++ b/mace/kernels/arm/conv_2d_neon_3x3.cc @@ -24,22 +24,22 @@ namespace kernels { // Ho = 2, Wo = 4, Co = 2 void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 2) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 2) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input, void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t c = 0; c < in_shape[1]; ++c) { + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; const float *in_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr = filter + m * in_channels * 9 + c * 9; diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index f4fe7ce7..3d77d8f6 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const 
index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index 057b9313..4432f2a0 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - 
const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) diff --git a/mace/kernels/arm/depthwise_conv2d_neon.h b/mace/kernels/arm/depthwise_conv2d_neon.h index 130cd360..119867bf 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon.h +++ b/mace/kernels/arm/depthwise_conv2d_neon.h @@ -22,15 +22,9 @@ namespace kernels { void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, 
const index_t valid_h_stop, const index_t valid_w_start, diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc index fb36bdad..443e57b7 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc @@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base, // Ho = 2, Wo = 4, Co = 1 void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, } // h #else for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { + for (index_t iw = 0; iw < out_shape[3]; ++iw) { DepthwiseConv2dPixel(in_base, filter_ptr, ih, @@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, 
const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; - + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { for (w = 0; w < out_width; ++w) { @@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index dfe5540d..7f825828 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -84,49 +84,46 @@ struct Conv2dFunctor : Conv2dFunctorBase { void Conv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t filter_size = filter_height * filter_width; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 
4) { + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = stride_hw[0]; + const int stride_w = stride_hw[1]; + const int dilation_h = dilation_hw[0]; + const int dilation_w = dilation_hw[1]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr0 = filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = - filter + (m + 1) * in_channels * filter_size + c * filter_size; - const float *filter_ptr2 = - filter + (m + 2) * in_channels * filter_size + c * filter_size; - const float *filter_ptr3 = - filter + (m + 3) * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -144,8 +141,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { vo3[ow] = out_ptr3_base[out_offset + ow]; } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -185,10 +182,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; - filter_ptr1 += filter_width; - filter_ptr2 += filter_width; - filter_ptr3 += filter_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -230,8 +227,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -244,7 +241,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; + filter_ptr0 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -325,6 +322,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { index_t dilation_h = dilations_[0]; index_t dilation_w = dilations_[1]; + const index_t filter_hw[2] = {filter_h, filter_w}; + MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); index_t padded_input_height = input_height + paddings[0]; @@ -478,6 +477,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { 
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT); Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT); Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT); + const index_t extra_input_shape[4] = + {batch, input_channels, extra_input_height, extra_input_width}; + const index_t extra_output_shape[4] = + {batch, channels, extra_output_height, extra_output_width}; // decide which convolution function to call if (use_winograd) { @@ -512,6 +515,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { float *transformed_input_data = transformed_input.mutable_data(); float *transformed_output_data = transformed_output.mutable_data(); + conv_func = [=](const float *pad_input, float *pad_output) { WinoGradConv3x3s1(pad_input, transformed_filter_ptr, @@ -529,26 +533,16 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_3x3_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_1x1_s1) { @@ -566,71 +560,43 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK5x5S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s3) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S3(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, + extra_input_shape, + extra_output_shape, + filter_shape.data(), + strides_, + dilations_, pad_output); }; } diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 14d78a45..7c20addd 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -41,48 +41,40 @@ template void Deconv2dNCHW(const T *input, const T *filter, const T *bias, - const index_t batch, - const index_t in_height, - const index_t 
in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const index_t filter_height, - const index_t filter_width, - const index_t stride_h, - const index_t stride_w, - const int padding_top, - const int padding_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *kernel_hw, + const int *strides, + const int *padding, float *output) { #pragma omp parallel for collapse(4) - for (index_t b = 0; b < batch; ++b) { - for (index_t oc = 0; oc < out_channels; ++oc) { - for (index_t oh = 0; oh < out_height; ++oh) { - for (index_t ow = 0; ow < out_width; ++ow) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < out_shape[1]; ++oc) { + for (index_t oh = 0; oh < out_shape[2]; ++oh) { + for (index_t ow = 0; ow < out_shape[3]; ++ow) { index_t filter_start_y, filter_start_x; - index_t start_x = std::max(0, ow + stride_w -1 - padding_left); - index_t start_y = std::max(0, oh + stride_h -1 - padding_top); - start_x /= stride_w; - start_y /= stride_h; - filter_start_x = padding_left + stride_w * start_x - ow; - filter_start_y = padding_top + stride_h * start_y - oh; - filter_start_x = filter_width - 1 - filter_start_x; - filter_start_y = filter_height - 1 - filter_start_y; + index_t start_x = std::max(0, ow + strides[1] -1 - padding[1]); + index_t start_y = std::max(0, oh + strides[0] -1 - padding[0]); + start_x /= strides[1]; + start_y /= strides[0]; + filter_start_x = padding[1] + strides[1] * start_x - ow; + filter_start_y = padding[0] + strides[0] * start_y - oh; + filter_start_x = kernel_hw[1] - 1 - filter_start_x; + filter_start_y = kernel_hw[0] - 1 - filter_start_y; T out_value = 0; index_t out_pos = - ((b * out_channels + oc) * out_height + oh) * out_width + ow; - for (index_t ic = 0; ic < in_channels; ++ic) { + ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow; + for (index_t ic = 0; ic < in_shape[1]; ++ic) { for (index_t f_y = filter_start_y, ih = start_y; - f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) { + f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) { for (index_t f_x = filter_start_x, iw = start_x; - f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) { + f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) { index_t weight_pos = - ((oc * in_channels + ic) * filter_height + f_y) - * filter_width + f_x; + ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y) + * kernel_hw[1] + f_x; index_t in_pos = - ((b * in_channels + ic) * in_height + ih) - * in_width + iw; + ((b * in_shape[1] + ic) * in_shape[2] + ih) + * in_shape[3] + iw; out_value += input[in_pos] * filter[weight_pos]; } } @@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { paddings_.data(), true); output->Resize(output_shape_); } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); - MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); + const index_t *in_shape = input->shape().data(); + const index_t *out_shape = 
output->shape().data(); + const index_t kernel_hw[2] = {kernel_h, kernel_w}; + + MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", + output_shape[1]); + MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", + in_shape[1]); + MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch"); Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard bias_mapper(bias); @@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { auto filter_data = filter->data(); auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - int padding_top = (paddings_[0] + 1) >> 1; - int padding_left = (paddings_[1] + 1) >> 1; - - deconv::Deconv2dNCHW(input_data, filter_data, bias_data, - batch, input_height, input_width, input_channels, - height, width, channels, - kernel_h, kernel_w, - stride_h, stride_w, padding_top, padding_left, + int padding[2]; + padding[0] = (paddings_[0] + 1) >> 1; + padding[1] = (paddings_[1] + 1) >> 1; + deconv::Deconv2dNCHW(input_data, + filter_data, + bias_data, + in_shape, + out_shape, + kernel_hw, + strides_, + padding, output_data); - DoActivation(output_data, output_data, output->size(), activation_, + DoActivation(output_data, + output_data, + output->size(), + activation_, relux_max_limit_); } }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 733591a5..2afd905b 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor { : block_size_(block_size), d2s_(d2s) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { MACE_UNUSED(future); - const int batch_size = input->dim(0); - const int input_depth = input->dim(1); - const int input_height = input->dim(2); - const int input_width = input->dim(3); + const index_t batch_size = input->dim(0); + const index_t input_depth = input->dim(1); + const index_t input_height = input->dim(2); + const index_t input_width = input->dim(3); index_t output_depth, output_width, output_height; @@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor { if (d2s_) { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < output_depth; ++d) { - for (int h = 0; h < output_height; ++h) { - const int in_h = h / block_size_; - const int offset_h = (h % block_size_); + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < output_depth; ++d) { + for (index_t h = 0; h < output_height; ++h) { + const index_t in_h = h / block_size_; + const index_t offset_h = (h % block_size_); for (int w = 0; w < output_width; ++w) { const index_t in_w = w / block_size_; const index_t offset_w = w % block_size_; @@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor { } } else { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < input_depth; ++d) { - for (int h = 0; h < input_height; ++h) { - const int out_h = h / block_size_; - const int offset_h = (h % block_size_); - for (int w = 0; w < input_width; ++w) { - const int out_w = w / block_size_; - const int offset_w = (w % block_size_); - const int offset_d = + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < input_depth; ++d) { + for (index_t h = 0; h < input_height; ++h) { + const index_t out_h = h / block_size_; + const index_t offset_h = (h % block_size_); + for (index_t w = 0; w < input_width; ++w) { + const index_t out_w = w / block_size_; + const index_t offset_w = (w % 
block_size_); + const index_t offset_d = (offset_h * block_size_ + offset_w) * input_depth; - const int out_d = d + offset_d; + const index_t out_d = d + offset_d; const index_t o_index = ((b * output_depth + out_d) * output_height + out_h) * output_width + out_w; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index ce3c1e48..a276b504 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor void DepthwiseConv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t multiplier = out_channels / in_channels; + const index_t multiplier = filter_shape[0] / filter_shape[1]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < filter_shape[0]; ++m) { + for (index_t h = 0; h < out_shape[2]; ++h) { + for (index_t w = 0; w < out_shape[3]; ++w) { + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; index_t out_offset = ((b * out_channels + m) * out_height + h) * out_width + w; index_t c = m / multiplier; @@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor float sum = 0; for (index_t kh = 0; kh < filter_height; ++kh) { for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_h + kh * dilation_h - pad_top; - index_t iw = w * stride_w + kw * dilation_w - pad_left; + index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; + index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { index_t in_offset = ((b * in_channels + c) * in_height + ih) * in_width + iw; @@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor auto bias_data = bias == nullptr ? 
nullptr : bias->data(); auto output_data = output->mutable_data(); + const int pad_hw[2] = {pad_top, pad_left}; + const index_t input_shape[4] = + {batch, input_channels, input_height, input_width}; + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S1(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S2(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dGeneral(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + filter_shape.data(), + strides_, + dilations_, + pad_hw, output); }; } diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index ce08e51f..b6a7370d 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { const BufferType type, Tensor *output, StatsFuture *future) { + MACE_UNUSED(input); + MACE_UNUSED(type); + MACE_UNUSED(output); + MACE_UNUSED(future); MACE_NOT_IMPLEMENTED; } }; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index b6d2b4b1..05b9d169 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -90,7 +90,8 @@ void BiasAddFunctor::operator()(const Tensor *input, } else { std::vector roundup_gws(lws.size()); for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); + if (lws[i] != 0) + roundup_gws[i] = RoundUp(gws[i], lws[i]); } error = runtime->command_queue().enqueueNDRangeKernel( diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 97a65f1e..9c510b34 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -75,39 +75,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void MaxPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t 
out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res = std::max(res, input[input_offset]); @@ -122,40 +121,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void AvgPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = 0; int block_size = 0; - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if 
(inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res += input[input_offset]; @@ -200,59 +197,25 @@ struct PoolingFunctor: PoolingFunctorBase { const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t height = output_shape[2]; - index_t width = output_shape[3]; - - index_t input_height = input_shape[2]; - index_t input_width = input_shape[3]; - - int filter_h = kernels_[0]; - int filter_w = kernels_[1]; - - int stride_h = strides_[0]; - int stride_w = strides_[1]; - - int dilation_h = dilations_[0]; - int dilation_w = dilations_[1]; - - int pad_top = paddings[0] / 2; - int pad_left = paddings[1] / 2; + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { MaxPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { AvgPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 06b6182e..4e53ee7a 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { MACE_UNUSED(input); MACE_UNUSED(bias); MACE_UNUSED(output); + MACE_UNUSED(future); MACE_NOT_IMPLEMENTED; } }; diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 5f07425a..7c6d3b56 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -38,7 +38,7 @@ class ActivationOp : public Operator { const Tensor *input_tensor = this->Input(0); const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr; - Tensor *output_tensor = this->outputs_[0]; + Tensor *output_tensor = this->Output(0); output_tensor->ResizeLike(input_tensor); functor_(input_tensor, alpha_tensor, output_tensor, future); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index bf9006fb..0f46e7a0 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -618,6 +618,8 @@ struct Expector { static void Near(const Tensor &x, const Tensor &y, const double rel_err, const double abs_err) { + MACE_UNUSED(rel_err); + MACE_UNUSED(abs_err); Equal(x, y); } }; diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc index 52ddca5f..a8b06e69 100644 --- a/mace/utils/logging.cc +++ b/mace/utils/logging.cc @@ -15,6 +15,7 @@ #include "mace/utils/logging.h" #include +#include #if defined(ANDROID) || defined(__ANDROID__) #include #include -- GitLab
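Note on the recurring change above: across the conv, depthwise-conv, deconv and pooling kernels, this patch works around an OpenMP problem observed with android-ndk-r15c by changing the kernel signatures. Functions that previously took many const index_t scalar arguments (batch, in_height, in_width, in_channels, out_height, out_width, out_channels, ...) and referenced them inside "#pragma omp parallel for collapse(2)" regions now take NCHW shape arrays (const index_t *in_shape / *out_shape, plus pad_hw, stride_hw, dilation_hw arrays) and re-read the individual dimensions inside the loop body. The sketch below is not part of the patch; it is a minimal illustration of the resulting calling convention on a trivial kernel, with made-up names (ScaleNCHW) and a locally defined index_t.

    // Minimal sketch, assuming an NCHW float tensor layout as in the kernels above.
    // ScaleNCHW and this standalone index_t typedef are illustrative only.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    typedef int64_t index_t;

    // Shapes are passed as 4-element NCHW arrays instead of seven scalar
    // parameters; per-dimension locals are re-read inside the parallel loop
    // body, mirroring the pattern the patch applies to the MACE kernels.
    void ScaleNCHW(const float *input,
                   const index_t *in_shape,   // {batch, channels, height, width}
                   const index_t *out_shape,  // same layout as in_shape
                   float scale,
                   float *output) {
      const index_t in_image_size = in_shape[2] * in_shape[3];
      const index_t out_image_size = out_shape[2] * out_shape[3];
      const index_t in_batch_size = in_shape[1] * in_image_size;
      const index_t out_batch_size = out_shape[1] * out_image_size;
    #pragma omp parallel for collapse(2)
      for (index_t b = 0; b < out_shape[0]; ++b) {
        for (index_t c = 0; c < out_shape[1]; ++c) {
          // Dimensions read inside the loop body rather than captured as
          // function-scope scalars, as in the patched kernels.
          const index_t height = out_shape[2];
          const index_t width = out_shape[3];
          const float *in_base = input + b * in_batch_size + c * in_image_size;
          float *out_base = output + b * out_batch_size + c * out_image_size;
          for (index_t i = 0; i < height * width; ++i) {
            out_base[i] = in_base[i] * scale;
          }
        }
      }
    }

    int main() {
      const index_t shape[4] = {1, 2, 3, 4};  // N, C, H, W
      std::vector<float> in(1 * 2 * 3 * 4, 1.0f), out(in.size(), 0.0f);
      ScaleNCHW(in.data(), shape, shape, 2.0f, out.data());
      std::printf("out[0] = %f\n", out[0]);  // expect 2.000000
      return 0;
    }

The same two-pointer convention is what the conv_func lambdas in conv_2d.h and depthwise_conv2d.h now forward, which also keeps their by-value captures to a couple of shape arrays instead of seven separate integers.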