Commit 0436bb8e authored by liuqi

Fix OpenMP bugs in the CPU kernels.

Parent c5d83495
@@ -51,3 +51,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+config_setting(
+    name = "openmp_enabled",
+    define_values = {
+        "openmp": "true",
+    },
+    visibility = ["//visibility:public"],
+)
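For reference, the new setting matches any build invoked with the OpenMP define, which is what the updated build scripts at the end of this diff pass. A minimal invocation would look like the following (the helloworld target appears in the next hunk; the full package label is an assumption):

    bazel build //mace/examples:helloworld --define openmp=true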
 # Examples
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_binary(
     name = "helloworld",
     srcs = [
         "helloworld.cc",
     ],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     deps = [
         "//mace/core",
         "//mace/ops",
@@ -17,7 +17,7 @@ cc_test(
     name = "benchmark_example",
     testonly = 1,
     srcs = ["benchmark_example.cc"],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     linkstatic = 1,
     deps = [
         "//mace/core",
...
@@ -7,7 +7,7 @@ package(
 licenses(["notice"])  # Apache 2.0
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_library(
     name = "kernels",
@@ -23,7 +23,7 @@ cc_library(
     ]) + if_neon_enabled(glob([
         "neon/*.h",
     ])),
-    copts = if_neon_enabled(["-fopenmp"]),
+    copts = if_openmp_enabled(["-fopenmp"]),
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
...
@@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase {
       }
     }
-    index_t pos = 0;
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             if (folded_constant_) {
               output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
             } else {
               output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
             }
-            ++pos;
           }
         }
       }
...
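The hunk above shows the bug class this commit fixes; the bias-add, conv2d, pooling, and softmax hunks below repeat the same fix. The old code threaded a single running index (or pointer) through the loop nest, so once `#pragma omp parallel for` splits the outer iterations across threads, all threads read and increment the same counter and write through racy, incorrect offsets. Deriving the offset from the loop variables makes every iteration independent, which is also what permits the stronger collapse(4). A self-contained sketch of the broken and fixed patterns (illustrative only, not code from this repository):

    #include <cstdint>
    using index_t = std::int64_t;

    // Broken: `pos` is shared across threads, so the increments race and
    // most elements are written through stale offsets.
    void scale_racy(const float *in, float *out, const float *scale,
                    index_t batch, index_t height, index_t width,
                    index_t channels) {
      index_t pos = 0;
    #pragma omp parallel for
      for (index_t n = 0; n < batch; ++n) {
        for (index_t h = 0; h < height; ++h) {
          for (index_t w = 0; w < width; ++w) {
            for (index_t c = 0; c < channels; ++c) {
              out[pos] = scale[c] * in[pos];
              ++pos;  // data race under OpenMP
            }
          }
        }
      }
    }

    // Fixed: the NHWC offset is a pure function of the loop indices, so
    // iterations are independent and the whole nest can be collapsed.
    void scale_fixed(const float *in, float *out, const float *scale,
                     index_t batch, index_t height, index_t width,
                     index_t channels) {
    #pragma omp parallel for collapse(4)
      for (index_t n = 0; n < batch; ++n) {
        for (index_t h = 0; h < height; ++h) {
          for (index_t w = 0; w < width; ++w) {
            for (index_t c = 0; c < channels; ++c) {
              index_t pos = (((n * height) + h) * width + w) * channels + c;
              out[pos] = scale[c] * in[pos];
            }
          }
        }
      }
    }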
@@ -33,14 +33,13 @@ struct BiasAddFunctor {
     T *output_ptr = output->mutable_data<T>();
-    index_t pos = 0;
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
-            ++pos;
           }
         }
       }
...
@@ -103,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase {
     auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
     auto output_data = output->mutable_data<T>();
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
           for (int c = 0; c < channels; ++c) {
+            const int out_idx = ((n * height + h) * width + w) * channels + c;
             T bias_channel = 0.0f;
             if (bias) bias_channel = bias_data[c];
-            *output_data = bias_channel;
+            output_data[out_idx] = bias_channel;
             T sum = 0.0f;
             const T *filter_ptr = filter_data + c;
             for (int kh = 0; kh < kernel_h; ++kh) {
@@ -123,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                         inw >= padded_w_start && inw < padded_w_stop,
                         "Out of range read from input: ", inh, ", ",
                         inw);
-                  // else padding with 0:
-                  // sum += 0;
                 } else {
                   index_t input_offset =
                       n * input_height * input_width * input_channels +
@@ -136,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                 }
               }
             }
-            *output_data += sum;
-            output_data++;
+            output_data[out_idx] += sum;
           }
         }
       }
     }
-    output_data = output->mutable_data<T>();
     DoActivation(output_data, output_data, output->NumElements(), activation_,
                  relux_max_limit_, prelu_alpha_);
   }
...
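Two details in the conv2d hunk above are easy to miss. First, collapse(4) requires the collapsed loops to form a perfectly nested canonical loop nest, with no statements between the for levels, which is why out_idx is computed inside the innermost loop body rather than hoisted between loops. Second, because output_data is no longer bumped element by element, the old reset of the pointer before DoActivation is unnecessary and is deleted. A minimal sketch of the nesting constraint (hypothetical code, not from this repository):

    // Valid for collapse(2): the two loops are perfectly nested, and the
    // row offset is derived inside the innermost body. Hoisting
    // `int row_base = i * cols;` between the two loops would make the
    // nest imperfect and the pragma ill-formed.
    void zero_matrix(float *out, int rows, int cols) {
    #pragma omp parallel for collapse(2)
      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
          out[i * cols + j] = 0.0f;
        }
      }
    }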
@@ -114,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
...
@@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
         {1, kwg_size / 32, 32, 1},
         {1, kwg_size / 64, 64, 1},
         {1, kwg_size / 128, 128, 1},
+        {4, kwg_size / 16, 4, 1},
+        {4, kwg_size / 28, 7, 1},
+        {4, kwg_size / 32, 8, 1},
+        {4, kwg_size / 56, 14, 1},
         {3, 15, 9, 1},
         {7, 15, 9, 1},
         {9, 7, 15, 1},
...
@@ -96,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase {
     int padded_w_start = 0 - paddings[1] / 2;
     if (pooling_type_ == MAX) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -114,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase {
                 }
               }
-              *output = res;
-              output++;
+              output[out_offset] = res;
             }
           }
         }
       }
     } else if (pooling_type_ == AVG) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
@@ -141,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase {
                 }
               }
-              *output = sum / block_size;
-              output++;
+              output[out_offset] = sum / block_size;
             }
           }
         }
...
@@ -29,21 +29,20 @@ struct SoftmaxFunctor {
     const index_t num_classes = logits_shape.back();
 #pragma omp parallel for
     for (index_t i = 0; i < batch_size; ++i) {
-      T max_value = *logits_ptr;
+      const index_t pos = i * num_classes;
+      T max_value = logits_ptr[pos];
       for (index_t c = 1; c < num_classes; ++c) {
-        max_value = std::max(max_value, logits_ptr[c]);
+        max_value = std::max(max_value, logits_ptr[pos + c]);
       }
       // TODO: check overflow?
       T sum = 0;
       std::vector<T> exp_data(num_classes);
       for (index_t c = 0; c < num_classes; ++c) {
-        exp_data[c] = ::exp((*logits_ptr - max_value));
+        exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
         sum += exp_data[c];
-        logits_ptr++;
       }
       for (index_t c = 0; c < num_classes; ++c) {
-        *output_ptr = exp_data[c] / sum;
-        output_ptr++;
+        output_ptr[pos + c] = exp_data[c] / sum;
       }
     }
   }
...
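As a side note, the rewritten loop is the standard numerically stabilized softmax: subtracting the row maximum keeps every argument of exp() at or below zero, which addresses the overflow half of the `// TODO: check overflow?` comment (underflow of very small terms is still possible, and the sum stays at least 1, so the division is safe). With pos = i * num_classes, each row i computes:

    m_i = max_k logits_ptr[pos + k]
    output_ptr[pos + c] = exp(logits_ptr[pos + c] - m_i) / sum_k exp(logits_ptr[pos + k] - m_i)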
@@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a):
         "//mace:hexagon_enabled": [],
         "//conditions:default": a,
     })
+def if_openmp_enabled(a):
+    return select({
+        "//mace:openmp_enabled": a,
+        "//conditions:default": [],
+    })
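Taken together with the config_setting added at the top of this diff, the helper completes Bazel's usual define-to-flag wiring. A condensed sketch of the three cooperating pieces, assembled from the hunks in this commit (file placement is assumed from the //mace labels):

    # mace/BUILD: matches builds invoked with --define openmp=true
    config_setting(
        name = "openmp_enabled",
        define_values = {"openmp": "true"},
    )

    # mace.bzl: expands to its argument only when the setting matches
    def if_openmp_enabled(a):
        return select({
            "//mace:openmp_enabled": a,
            "//conditions:default": [],
        })

    # A consuming target: -fopenmp is applied only in OpenMP builds.
    cc_library(
        name = "kernels",
        copts = if_openmp_enabled(["-fopenmp"]),
    )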
@@ -7,7 +7,7 @@ package(
 licenses(["notice"])  # Apache 2.0
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_library(
     name = "test",
@@ -34,7 +34,7 @@ cc_library(
         ["*.h"],
         exclude = ["ops_test_util.h"],
     ),
-    copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]),
+    copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
     deps = [
         "//mace/kernels",
     ],
...
@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
     --copt="-Werror=return-type" \
-    --define neon=false
+    --define neon=false \
+    --define openmp=true
 if [ $? -ne 0 ]; then
     exit 1
...
@@ -55,7 +55,8 @@ build_target()
     --copt="-std=c++11" \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-Werror=return-type" \
-    --copt="-DMACE_OBFUSCATE_LITERALS" || exit -1
+    --copt="-DMACE_OBFUSCATE_LITERALS" \
+    --define openmp=true || exit -1
 }
 merge_libs()
...