diff --git a/mace/kernels/arm/deconv_2d_neon.h b/mace/kernels/arm/deconv_2d_neon.h index 87b86b066bb1274ca3a3350a85b4d409880d43d6..1cddbf1a264c161bb767eddc4f1a7b5f4192e33d 100644 --- a/mace/kernels/arm/deconv_2d_neon.h +++ b/mace/kernels/arm/deconv_2d_neon.h @@ -26,28 +26,24 @@ namespace kernels { void Deconv2dNeonK3x3S1(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output); void Deconv2dNeonK3x3S2(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output); void Deconv2dNeonK4x4S1(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output); void Deconv2dNeonK4x4S2(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output); diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/kernels/arm/deconv_2d_neon_3x3.cc index d4d7d0cdffe767ca8dd5bf8092bbce57b504d8f5..c8f5006bbb66fb1a56e7a706fca5208de51c0385 100644 --- a/mace/kernels/arm/deconv_2d_neon_3x3.cc +++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc @@ -20,7 +20,6 @@ namespace kernels { void Deconv2dNeonK3x3S1(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output) { @@ -40,12 +39,6 @@ void Deconv2dNeonK3x3S1(const float *input, if (oc + 1 < outch) { float *out_base0 = output + (b * outch + oc) * out_img_size; float *out_base1 = out_base0 + out_img_size; - - const float bias_value0 = bias ? bias[oc] : 0.f; - const float bias_value1 = bias ? bias[oc + 1] : 0.f; - std::fill_n(out_base0, out_img_size, bias_value0); - std::fill_n(out_base1, out_img_size, bias_value1); - for (index_t ic = 0; ic < inch; ++ic) { const float *input_base = input + (b * inch + ic) * h * w; const float *kernel_base0 = filter + (oc * inch + ic) * 9; @@ -197,8 +190,6 @@ void Deconv2dNeonK3x3S1(const float *input, } } else { float *out_base0 = output + (b * outch + oc) * outh * outw; - const float bias_value0 = bias ? bias[oc] : 0.f; - std::fill_n(out_base0, outh * outw, bias_value0); for (index_t ic = 0; ic < inch; ++ic) { const float *input_base = input + (b * inch + ic) * h * w; const float *kernel_base0 = filter + (oc * inch + ic) * 9; @@ -290,7 +281,6 @@ void Deconv2dNeonK3x3S1(const float *input, void Deconv2dNeonK3x3S2(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output) { @@ -303,15 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(3) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { - float *out_base = output + (b * outch + oc) * out_img_size; - - const float bias_value = bias ? bias[oc] : 0.f; - std::fill_n(out_base, out_img_size, bias_value); - for (index_t ic = 0; ic < inch; ++ic) { + float *out_base = output + (b * outch + oc) * out_img_size; const float *input_base = input + (b * inch + ic) * h * w; const float *kernel_base = filter + (oc * inch + ic) * 9; const float *in = input_base; diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/kernels/arm/deconv_2d_neon_4x4.cc index 719a17e34fb1fce03276c59fa26a9f0476f3c75b..dd371ada1d223cdad6928d3e6cde6cf152ad2225 100644 --- a/mace/kernels/arm/deconv_2d_neon_4x4.cc +++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc @@ -20,7 +20,6 @@ namespace kernels { void Deconv2dNeonK4x4S1(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output) { @@ -32,16 +31,12 @@ void Deconv2dNeonK4x4S1(const float *input, const index_t outw = out_shape[3]; const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for +#pragma omp parallel for collapse(2) for (int b = 0; b < out_shape[0]; ++b) { for (int oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { float *out_base = output + (b * outch + oc) * out_img_size; float *out_base1 = out_base + out_img_size; - const float bias_value = bias ? bias[oc] : 0.f; - std::fill_n(out_base, out_img_size, bias_value); - const float bias_value1 = bias ? bias[oc + 1] : 0.f; - std::fill_n(out_base1, out_img_size, bias_value1); for (int q = 0; q < inch; q++) { const float *input_base = input + (b * inch + q) * h * w; const float *in = input_base; @@ -257,8 +252,6 @@ void Deconv2dNeonK4x4S1(const float *input, } } else { float *out_base = output + (b * outch + oc) * out_img_size; - const float bias_value = bias ? bias[oc] : 0.f; - std::fill_n(out_base, out_img_size, bias_value); for (int q = 0; q < inch; q++) { const float *input_base = input + (b * inch + q) * h * w; const float *kernel_base = filter + (oc * inch + q) * 16; @@ -381,7 +374,6 @@ void Deconv2dNeonK4x4S1(const float *input, void Deconv2dNeonK4x4S2(const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *out_shape, float *output) { @@ -394,14 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for +#pragma omp parallel for collapse(3) for (int b = 0; b < out_shape[0]; ++b) { for (int p = 0; p < outch; p++) { - float *out_base = output + (b * outch + p) * out_img_size; - const float bias_value = bias ? bias[p] : 0.f; - std::fill_n(out_base, outh * outw, bias_value); - for (int q = 0; q < inch; q++) { + float *out_base = output + (b * outch + p) * out_img_size; const float *input_base = input + (b * inch + q) * h * w; const float *kernel_base = filter + (p * inch + q) * 16; const float *in = input_base; diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 3656fff8855dbd6001bf9c018c5d3b029acf982a..ab32679aaea4824152395d08d7ee13b26cf5d286 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -184,7 +184,6 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { void Deconv2dGeneral(const float *input, const float *filter, - const float *bias, const index_t kernel_h, const index_t kernel_w, const int *strides, @@ -206,23 +205,25 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { } } -#pragma omp parallel for - for (int b = 0; b < in_shape[0]; ++b) { - for (int oc = 0; oc < out_shape[1]; ++oc) { - float *out_base = - output + (b * out_shape[1] + oc) * out_img_size; - const float bias_value = bias ? bias[oc] : 0.f; - std::fill_n(out_base, out_img_size, bias_value); + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + +#pragma omp parallel for collapse(4) + for (int b = 0; b < batch; ++b) { + for (int oc = 0; oc < out_channels; ++oc) { for (int i = 0; i < in_height; ++i) { for (int j = 0; j < in_width; ++j) { + float *out_base = + output + (b * out_channels + oc) * out_img_size; const index_t out_offset = i * strides[0] * out_width + j * strides[1]; - for (int ic = 0; ic < in_shape[1]; ++ic) { + for (int ic = 0; ic < in_channels; ++ic) { const index_t input_idx = - (b * in_shape[1] + ic) * in_img_size + i * in_width + j; + (b * in_channels + ic) * in_img_size + i * in_width + j; const float val = input[input_idx]; const index_t kernel_offset = - (oc * in_shape[1] + ic) * kernel_size; + (oc * in_channels + ic) * kernel_size; for (int k = 0; k < kernel_size; ++k) { const index_t out_idx = out_offset + index_map[k]; const index_t kernel_idx = kernel_offset + k; @@ -248,7 +249,7 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; -#pragma omp parallel for +#pragma omp parallel for collapse(3) for (int i = 0; i < batch; ++i) { for (int j = 0; j < channel; ++j) { for (int k = 0; k < out_height; ++k) { @@ -324,7 +325,6 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { "Input/Output batch size mismatch"); std::function deconv_func; @@ -354,6 +354,8 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { scratch->Rewind(); scratch->GrowSize(padded_out_size); Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); + padded_out.Reshape(padded_out_shape); + padded_out.Clear(); auto *padded_out_data = padded_out.mutable_data(); bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && @@ -369,13 +371,11 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { if (use_neon_3x3_s1) { deconv_func = [=](const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *padded_out_shape, float *padded_output) { Deconv2dNeonK3x3S1(input, filter, - bias, in_shape, padded_out_shape, padded_output); @@ -383,13 +383,11 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { } else if (use_neon_3x3_s2) { deconv_func = [=](const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *padded_out_shape, float *padded_output) { Deconv2dNeonK3x3S2(input, filter, - bias, in_shape, padded_out_shape, padded_output); @@ -397,13 +395,11 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { } else if (use_neon_4x4_s1) { deconv_func = [=](const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *padded_out_shape, float *padded_output) { Deconv2dNeonK4x4S1(input, filter, - bias, in_shape, padded_out_shape, padded_output); @@ -411,13 +407,11 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { } else if (use_neon_4x4_s2) { deconv_func = [=](const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *padded_out_shape, float *padded_output) { Deconv2dNeonK4x4S2(input, filter, - bias, in_shape, padded_out_shape, padded_output); @@ -425,13 +419,11 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { } else { deconv_func = [=](const float *input, const float *filter, - const float *bias, const index_t *in_shape, const index_t *padded_out_shape, float *padded_output) { Deconv2dGeneral(input, filter, - bias, kernel_h, kernel_w, strides_.data(), @@ -444,9 +436,24 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { bool no_pad = padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; float *out_data = no_pad ? output_data : padded_out_data; + + if (bias_data != nullptr) { + const index_t batch = output_shape[0]; + const index_t channels = output_shape[1]; + const index_t img_size = output_shape[2] * output_shape[3]; +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < img_size; ++i) { + output_data[(b * channels + c) * img_size + i] += + bias_data[c]; + } + } + } + } + deconv_func(input_data, filter_data, - bias_data, in_shape, padded_out_shape.data(), out_data); @@ -459,6 +466,8 @@ struct Deconv2dFunctor: Deconv2dFunctorBase { output_data); } + + DoActivation(output_data, output_data, output->size(), diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 9835bc4d1f9c2f57903980d31415317657abff82..269d90865fc091032f0a9f1f3316653379c8827a 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -120,15 +120,12 @@ static void Deconv2d(int iters, MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); -MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32); -MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128); MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32); -MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32); MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32); MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32); MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32); @@ -138,10 +135,10 @@ MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3); MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128); MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128); -MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32); -MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32); MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32); +MACE_BM_DECONV_2D(1, 32, 1014, 762, 9, 9, 2, 2035, 1531, VALID, 1); + } // namespace test } // namespace ops } // namespace mace