From 0436bb8e70aa2408618a3ab934d7d0e1baddf3b6 Mon Sep 17 00:00:00 2001 From: liuqi Date: Wed, 31 Jan 2018 19:54:49 +0800 Subject: [PATCH] Fix openmp bugs for cpu kernel. --- mace/BUILD | 8 ++++++++ mace/examples/BUILD | 6 +++--- mace/kernels/BUILD | 4 ++-- mace/kernels/batch_norm.h | 5 ++--- mace/kernels/bias_add.h | 5 ++--- mace/kernels/conv_2d.h | 10 ++++------ mace/kernels/depthwise_conv2d.h | 2 +- mace/kernels/opencl/helper.cc | 4 ++++ mace/kernels/pooling.h | 12 ++++++------ mace/kernels/softmax.h | 11 +++++------ mace/mace.bzl | 6 ++++++ mace/ops/BUILD | 4 ++-- tools/bazel-adb-run.sh | 3 ++- tools/export_local_lib.sh | 3 ++- 14 files changed, 49 insertions(+), 34 deletions(-) diff --git a/mace/BUILD b/mace/BUILD index 98a9ab05..bebe0e9d 100644 --- a/mace/BUILD +++ b/mace/BUILD @@ -51,3 +51,11 @@ config_setting( }, visibility = ["//visibility:public"], ) + +config_setting( + name = "openmp_enabled", + define_values = { + "openmp": "true", + }, + visibility = ["//visibility:public"], +) diff --git a/mace/examples/BUILD b/mace/examples/BUILD index 233b59f1..ff47e1d9 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -1,12 +1,12 @@ # Examples -load("//mace:mace.bzl", "if_android", "if_neon_enabled") +load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled") cc_binary( name = "helloworld", srcs = [ "helloworld.cc", ], - linkopts = if_neon_enabled(["-fopenmp"]), + linkopts = if_openmp_enabled(["-fopenmp"]), deps = [ "//mace/core", "//mace/ops", @@ -17,7 +17,7 @@ cc_test( name = "benchmark_example", testonly = 1, srcs = ["benchmark_example.cc"], - linkopts = if_neon_enabled(["-fopenmp"]), + linkopts = if_openmp_enabled(["-fopenmp"]), linkstatic = 1, deps = [ "//mace/core", diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index a4646d90..ba1b601f 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -7,7 +7,7 @@ package( licenses(["notice"]) # Apache 2.0 -load("//mace:mace.bzl", "if_android", "if_neon_enabled") +load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled") cc_library( name = "kernels", @@ -23,7 +23,7 @@ cc_library( ]) + if_neon_enabled(glob([ "neon/*.h", ])), - copts = if_neon_enabled(["-fopenmp"]), + copts = if_openmp_enabled(["-fopenmp"]), linkopts = if_android(["-lm"]), deps = [ "//mace/core", diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index bd8fc7e9..6f16bf6f 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase { } } - index_t pos = 0; -#pragma omp parallel for +#pragma omp parallel for collapse(4) for (index_t n = 0; n < batch; ++n) { for (index_t h = 0; h < height; ++h) { for (index_t w = 0; w < width; ++w) { for (index_t c = 0; c < channels; ++c) { + index_t pos = (((n * height) + h) * width + w) * channels + c; if (folded_constant_) { output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c]; } else { output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; } - ++pos; } } } diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index 5b87026d..1e7f6dc8 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -33,14 +33,13 @@ struct BiasAddFunctor { T *output_ptr = output->mutable_data(); - index_t pos = 0; -#pragma omp parallel for +#pragma omp parallel for collapse(4) for (index_t n = 0; n < batch; ++n) { for (index_t h = 0; h < height; ++h) { for (index_t w = 0; w < width; ++w) { for (index_t c = 0; c < channels; ++c) { + index_t pos = (((n * height) + h) * width + w) * channels + c; output_ptr[pos] = input_ptr[pos] + bias_ptr[c]; - ++pos; } } } diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 8da36579..cc331a17 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -103,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase { auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); +#pragma omp parallel for collapse(4) for (int n = 0; n < batch; ++n) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { + const int out_idx = ((n * height + h) * width + w) * channels + c; T bias_channel = 0.0f; if (bias) bias_channel = bias_data[c]; - *output_data = bias_channel; + output_data[out_idx] = bias_channel; T sum = 0.0f; const T *filter_ptr = filter_data + c; for (int kh = 0; kh < kernel_h; ++kh) { @@ -123,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { inw >= padded_w_start && inw < padded_w_stop, "Out of range read from input: ", inh, ", ", inw); - // else padding with 0: - // sum += 0; } else { index_t input_offset = n * input_height * input_width * input_channels + @@ -136,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase { } } } - *output_data += sum; - output_data++; + output_data[out_idx] += sum; } } } } - output_data = output->mutable_data(); DoActivation(output_data, output_data, output->NumElements(), activation_, relux_max_limit_, prelu_alpha_); } diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index cdf915c0..39579724 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -114,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { const T *bias_ptr = bias == nullptr ? nullptr : bias->data(); T *output_ptr = output->mutable_data(); -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(4) for (int n = 0; n < batch; ++n) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 451a19d0..cc9cfee9 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, {1, kwg_size / 32, 32, 1}, {1, kwg_size / 64, 64, 1}, {1, kwg_size / 128, 128, 1}, + {4, kwg_size / 16, 4, 1}, + {4, kwg_size / 28, 7, 1}, + {4, kwg_size / 32, 8, 1}, + {4, kwg_size / 56, 14, 1}, {3, 15, 9, 1}, {7, 15, 9, 1}, {9, 7, 15, 1}, diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index dcf1c53a..d9a28c54 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -96,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase { int padded_w_start = 0 - paddings[1] / 2; if (pooling_type_ == MAX) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(4) for (int b = 0; b < batch; ++b) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { + index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t in_offset = b * in_image_size * input_channels + c; T res = std::numeric_limits::lowest(); for (int kh = 0; kh < kernel_h; ++kh) { @@ -114,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase { } } } - *output = res; - output++; + output[out_offset] = res; } } } } } else if (pooling_type_ == AVG) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(4) for (int b = 0; b < batch; ++b) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { + index_t out_offset = (((b * height) + h) * width + w) * channels + c; index_t in_offset = b * in_image_size * input_channels + c; T sum = 0; int block_size = 0; @@ -141,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase { } } } - *output = sum / block_size; - output++; + output[out_offset] = sum / block_size; } } } diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index b29514a2..2e5bc495 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -29,21 +29,20 @@ struct SoftmaxFunctor { const index_t num_classes = logits_shape.back(); #pragma omp parallel for for (index_t i = 0; i < batch_size; ++i) { - T max_value = *logits_ptr; + const index_t pos = i * num_classes; + T max_value = logits_ptr[pos]; for (index_t c = 1; c < num_classes; ++c) { - max_value = std::max(max_value, logits_ptr[c]); + max_value = std::max(max_value, logits_ptr[pos + c]); } // TODO: check overflow? T sum = 0; std::vector exp_data(num_classes); for (index_t c = 0; c < num_classes; ++c) { - exp_data[c] = ::exp((*logits_ptr - max_value)); + exp_data[c] = ::exp((logits_ptr[pos + c] - max_value)); sum += exp_data[c]; - logits_ptr++; } for (index_t c = 0; c < num_classes; ++c) { - *output_ptr = exp_data[c] / sum; - output_ptr++; + output_ptr[pos + c] = exp_data[c] / sum; } } } diff --git a/mace/mace.bzl b/mace/mace.bzl index e9002885..3db0ff5c 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a): "//mace:hexagon_enabled": [], "//conditions:default": a, }) + +def if_openmp_enabled(a): + return select({ + "//mace:openmp_enabled": a, + "//conditions:default": [], + }) diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 442441e0..dbe6e5e2 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -7,7 +7,7 @@ package( licenses(["notice"]) # Apache 2.0 -load("//mace:mace.bzl", "if_android", "if_neon_enabled") +load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled") cc_library( name = "test", @@ -34,7 +34,7 @@ cc_library( ["*.h"], exclude = ["ops_test_util.h"], ), - copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]), + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]), deps = [ "//mace/kernels", ], diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh index 9c9033b1..6e1e68f2 100755 --- a/tools/bazel-adb-run.sh +++ b/tools/bazel-adb-run.sh @@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \ --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-DMACE_DISABLE_NO_TUNING_WARNING" \ --copt="-Werror=return-type" \ - --define neon=false + --define neon=false \ + --define openmp=true if [ $? -ne 0 ]; then exit 1 diff --git a/tools/export_local_lib.sh b/tools/export_local_lib.sh index 26aa3fa4..f5821e54 100755 --- a/tools/export_local_lib.sh +++ b/tools/export_local_lib.sh @@ -55,7 +55,8 @@ build_target() --copt="-std=c++11" \ --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ - --copt="-DMACE_OBFUSCATE_LITERALS" || exit -1 + --copt="-DMACE_OBFUSCATE_LITERALS" \ + --define openmp=true || exit -1 } merge_libs() -- GitLab