diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/kernels/arm/deconv_2d_neon_3x3.cc index c8f5006bbb66fb1a56e7a706fca5208de51c0385..cdba42c0dfb1439946e233f9e5704e29f09e05e9 100644 --- a/mace/kernels/arm/deconv_2d_neon_3x3.cc +++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc @@ -293,11 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(2) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { + float *out_base = output + (b * outch + oc) * out_img_size; for (index_t ic = 0; ic < inch; ++ic) { - float *out_base = output + (b * outch + oc) * out_img_size; const float *input_base = input + (b * inch + ic) * h * w; const float *kernel_base = filter + (oc * inch + ic) * 9; const float *in = input_base; diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/kernels/arm/deconv_2d_neon_4x4.cc index dd371ada1d223cdad6928d3e6cde6cf152ad2225..575a8494643f4fd9fa383d0f0222d09cc48930f2 100644 --- a/mace/kernels/arm/deconv_2d_neon_4x4.cc +++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc @@ -386,11 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(2) for (int b = 0; b < out_shape[0]; ++b) { for (int p = 0; p < outch; p++) { + float *out_base = output + (b * outch + p) * out_img_size; for (int q = 0; q < inch; q++) { - float *out_base = output + (b * outch + p) * out_img_size; const float *input_base = input + (b * inch + q) * h * w; const float *kernel_base = filter + (p * inch + q) * 16; const float *in = input_base; diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index ff1875fee9d146d9c90bcc88f5b461cd6e22da24..7e1ed460272658f871b2695254183a716868399c 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -214,13 +214,13 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { const index_t out_channels = out_shape[1]; const index_t in_channels = in_shape[1]; -#pragma omp parallel for collapse(4) +#pragma omp parallel for collapse(2) for (int b = 0; b < batch; ++b) { for (int oc = 0; oc < out_channels; ++oc) { + float *out_base = + output + (b * out_channels + oc) * out_img_size; for (int i = 0; i < in_height; ++i) { for (int j = 0; j < in_width; ++j) { - float *out_base = - output + (b * out_channels + oc) * out_img_size; const index_t out_offset = i * strides[0] * out_width + j * strides[1]; for (int ic = 0; ic < in_channels; ++ic) {