diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index a2b4c5cbe34a550d40224afb9e6c2d77aeacb980..bb2cea3b55dba1d32debfb6d30333f93259b2614 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -8,11 +8,9 @@ #include #include -#include #include "mace/core/logging.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/runtime/opencl/opencl_wrapper.h" namespace mace { namespace { @@ -66,7 +64,7 @@ bool BuildProgram(OpenCLRuntime *runtime, }; *program = cl::Program(runtime->context(), sources); - std::string build_options = "-Werror -cl-mad-enable -I" + path; + std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path; // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) { if (program->getBuildInfo(runtime->device()) == diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 057b2a80320130d322a1146698cfcb40334ca0a4..e18c9f93dae38ff8307adecbf321e65252d47427 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -20,15 +20,18 @@ namespace mace { class OpenCLRuntime { public: static OpenCLRuntime *Get(); - OpenCLRuntime(cl::Context context, - cl::Device device, - cl::CommandQueue command_queue); - ~OpenCLRuntime(); cl::Context &context(); cl::Device &device(); cl::CommandQueue &command_queue(); cl::Program &program(); + private: + OpenCLRuntime(cl::Context context, + cl::Device device, + cl::CommandQueue command_queue); + ~OpenCLRuntime(); + OpenCLRuntime(const OpenCLRuntime&) = delete; + OpenCLRuntime &operator=(const OpenCLRuntime&) = delete; private: cl::Context context_; diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 5c838be47b1ac118266a64197fd2a462eadaa433..cd3fb4b9bc47b31d82b7f0d053f356f3c1c62f6a 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -13,16 +13,13 @@ namespace kernels { template struct BatchNormFunctor { - void operator()(const T *input, - const T *scale, - const T *offset, - const T *mean, - const T *var, - const float variance_epsilon, - const index_t n, - const index_t channel, - const index_t sample_size, - T *output) { + void operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output) { // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . // The calculation formula for inference is // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + @@ -31,16 +28,35 @@ struct BatchNormFunctor { // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_offset = \offset - mean * common_val; // Y = new_scale * X + new_offset; - T new_scale, new_offset; + const index_t n = input->dim(0); + const index_t channel = input->dim(1); + const index_t sample_size = input->dim(2) * input->dim(3); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard scale_mapper(scale); + Tensor::MappingGuard offset_mapper(offset); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); + Tensor::MappingGuard epsilon_mapper(epsilon); + Tensor::MappingGuard output_mapper(output); + + const T *input_ptr = input->data(); + const T *scale_ptr = scale->data(); + const T *offset_ptr = offset->data(); + const T *mean_ptr = mean->data(); + const T *var_ptr = var->data(); + const T *epsilon_ptr = epsilon->data(); + T *output_ptr = output->mutable_data(); + #pragma omp parallel for for (index_t c = 0; c < channel; ++c) { - new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); - new_offset = offset[c] - mean[c] * new_scale; + T new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr); + T new_offset = offset_ptr[c] - mean_ptr[c] * new_scale; index_t pos = c * sample_size; for (index_t i = 0; i < n; ++i) { - const T *input_sample_ptr = input + pos; - T *output_sample_ptr = output + pos; + const T *input_sample_ptr = input_ptr + pos; + T *output_sample_ptr = output_ptr + pos; for (index_t j = 0; j < sample_size; ++j) { output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset; } @@ -52,16 +68,23 @@ struct BatchNormFunctor { template <> void BatchNormFunctor::operator()( - const float *input, - const float *scale, - const float *offset, - const float *mean, - const float *var, - const float variance_epsilon, - const index_t n, - const index_t channel, - const index_t sample_size, - float *output); + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output); + +template <> +void BatchNormFunctor::operator()( + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output); } // namepsace kernels } // namespace mace diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc index cd5fff22a8e608c9c59f986e1f9ee980fb944e4d..295cc59d48d44fe385ae7d86564674d5d8eecc78 100644 --- a/mace/kernels/neon/batch_norm_neon.cc +++ b/mace/kernels/neon/batch_norm_neon.cc @@ -10,38 +10,46 @@ namespace kernels { template <> void BatchNormFunctor::operator()( - const float *input, - const float *scale, - const float *offset, - const float *mean, - const float *var, - const float variance_epsilon, - const index_t n, - const index_t channel, - const index_t sample_size, - float *output) { + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output) { // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . // The calculation formula for inference is - // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + - // ( \offset - \frac { \scale * mean } { \sqrt{var+\variance_epsilon} + // Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X + + // ( \offset - \frac { \scale * mean } { \sqrt{var+\epsilon} // } - // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\epsilon} } // new_offset = \offset - mean * common_val; // Y = new_scale * X + new_offset; - float new_scale, new_offset; + const index_t n = input->dim(0); + const index_t channel = input->dim(1); + const index_t sample_size = input->dim(2) * input->dim(3); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); + const float *epsilon_ptr = epsilon->data(); + float *output_ptr = output->mutable_data(); + index_t count = sample_size >> 2; index_t remain_count = sample_size - (count << 2); #pragma omp parallel for for (index_t c = 0; c < channel; ++c) { - new_scale = scale[c] / std::sqrt(var[c] + variance_epsilon); - new_offset = offset[c] - mean[c] * new_scale; + float new_scale = scale_ptr[c] / std::sqrt(var_ptr[c] + *epsilon_ptr); + float new_offset = offset_ptr[c] - mean_ptr[c] * new_scale; index_t pos = c * sample_size; float32x4_t new_scale_f = vdupq_n_f32(new_scale); float32x4_t new_offset_f = vdupq_n_f32(new_offset); for (index_t i = 0; i < n; ++i) { - const float *input_sample_ptr = input + pos; - float *output_sample_ptr = output + pos; + const float *input_sample_ptr = input_ptr + pos; + float *output_sample_ptr = output_ptr + pos; for (index_t j = 0; j < count; ++j) { float32x4_t input_f = vld1q_f32(input_sample_ptr); diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc new file mode 100644 index 0000000000000000000000000000000000000000..67b810f5149ea51e2456071415857bca467abbf0 --- /dev/null +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -0,0 +1,46 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/batch_norm.h" +#include "mace/core/runtime/opencl/cl2.hpp" +#include "mace/core/runtime/opencl/opencl_runtime.h" + +namespace mace { +namespace kernels { + +template <> +void BatchNormFunctor::operator()( + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const Tensor *epsilon, + Tensor *output) { + const index_t n = input->dim(0); + const index_t channel = input->dim(1); + const index_t sample_size = input->dim(2) * input->dim(3); + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + auto _kernel = cl::Kernel(program, "batch_norm"); + _kernel.setArg(0, *(static_cast(input->buffer()))); + _kernel.setArg(1, *(static_cast(scale->buffer()))); + _kernel.setArg(2, *(static_cast(offset->buffer()))); + _kernel.setArg(3, *(static_cast(mean->buffer()))); + _kernel.setArg(4, *(static_cast(var->buffer()))); + _kernel.setArg(5, *(static_cast(epsilon->buffer()))); + _kernel.setArg(6, static_cast(sample_size)); + _kernel.setArg(7, *(static_cast(output->buffer()))); + _kernel.setArg(8, 32u, nullptr); + _kernel.setArg(9, 32u, nullptr); + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + _kernel, cl::NullRange, + cl::NDRange(n, channel, sample_size), + cl::NDRange(1, 1, 128)); + MACE_CHECK(error == CL_SUCCESS); +} + +} // namespace kernels +} // namespace mace \ No newline at end of file diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl new file mode 100644 index 0000000000000000000000000000000000000000..d5927071f222b6ff0c0cdb7dd32e2b15979983f8 --- /dev/null +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -0,0 +1,31 @@ +void kernel batch_norm(global const float *input, + global const float *scale, + global const float *offset, + global const float *mean, + global const float *var, + global const float *epsilon, + private const int pixels, + global float *output, + __local float *new_scale, + __local float *new_offset) { + const int batch = get_global_id(0); + const int channel = get_global_id(1); + const int channels = get_global_size(1); + const int pixel_offset = get_global_id(2); + const unsigned int local_channel = get_local_id(1); + const int local_pixel_idx = get_local_id(2); + + if(local_pixel_idx == 0) { + new_scale[local_channel] = scale[channel] * rsqrt(var[channel] + *epsilon); + new_offset[local_channel] = offset[channel] - mean[channel] * new_scale[local_channel]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const int sample_offset = (batch * channels + channel) * pixels + pixel_offset; + + const float *input_ptr = input + sample_offset; + float *output_ptr = output + sample_offset; + *output_ptr = new_scale[local_channel] * *input_ptr + new_offset[local_channel]; +} + diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 83574b53373fd52226aef06ebd80f392131c6732..e823136d965670cc198ed14c0f682cbf0b152d00 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -17,6 +17,7 @@ cc_library( ], deps = [ "//mace/core", + "//mace/core:opencl_runtime", "@gtest//:gtest", ], ) @@ -39,7 +40,6 @@ cc_library( "-fopenmp", ], deps = [ - "//mace/core", "//mace/kernels", "//mace/proto:cc_proto", ], @@ -72,7 +72,6 @@ cc_test( deps = [ ":ops", ":test", - "//mace/core", "//mace/core:test_benchmark_main", ], ) diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index f5b050f1ef1f4910eb9b3366735968daf99b7630..1ce9b1e090bbf171bbe3ff33c07512af12e94c80 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(BatchNorm, BatchNormOp); REGISTER_NEON_OPERATOR(BatchNorm, BatchNormOp); #endif // __ARM_NEON +REGISTER_OPENCL_OPERATOR(BatchNorm, BatchNormOp); + } // namespace mace \ No newline at end of file diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index a7292601315c1db3d4d97659a154ad0bf91d474c..1452bc726c4807b079bb6dcb24b4319275059bc1 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -40,20 +40,7 @@ class BatchNormOp : public Operator { Tensor *output = this->Output(0); output->ResizeLike(input); - const index_t n = input->dim(0); - const index_t channel = input->dim(1); - const index_t sample_size = input->dim(2) * input->dim(3); - - const T *input_ptr = input->data(); - const T *scale_ptr = scale->data(); - const T *offset_ptr = offset->data(); - const T *mean_ptr = mean->data(); - const T *var_ptr = var->data(); - const T *epsilon_ptr = epsilon->data(); - T *output_ptr = output->mutable_data(); - - functor_(input_ptr, scale_ptr, offset_ptr, mean_ptr, var_ptr, *epsilon_ptr, - n, channel, sample_size, output_ptr); + functor_(input, scale, offset, mean, var, epsilon, output); return true; } diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 8fc2479794961001c18aac4b69f3ac767c78425c..3d17aca7ec153060629a551e4eba3829c22d4529 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -24,21 +24,23 @@ static void BatchNorm( .Finalize(net.operator_def()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}, true); - net.AddInputFromArray("Epsilon", {}, {1e-3}); + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}, true); + net.AddInputFromArray("Epsilon", {}, {1e-3}); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); + net.Sync(); } mace::testing::StartTiming(); while (iters--) { net.RunOp(D); + net.Sync(); } } @@ -54,7 +56,8 @@ static void BatchNorm( #define BM_BATCH_NORM(N, C, H, W, TYPE) \ BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \ - BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON); + BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON); \ + BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL); BM_BATCH_NORM(1, 1, 512, 512, float); BM_BATCH_NORM(1, 3, 128, 128, float); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 9933877856754643334ebd0522bc505843a2814e..4c5d73bbe3981ec2af972a5c370ec2794c22ac2d 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -9,9 +9,10 @@ namespace mace { class BatchNormOpTest : public OpsTestBase {}; -TEST_F(BatchNormOpTest, SimpleCPU) { +template +void Simple() { // Construct graph - auto &net = test_net(); + OpsTestNet net; OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") .Input("Scale") @@ -23,26 +24,79 @@ TEST_F(BatchNormOpTest, SimpleCPU) { .Finalize(net.operator_def()); // Add input data - net.AddInputFromArray("Input", {1, 1, 6, 2}, + net.AddInputFromArray("Input", {1, 1, 6, 2}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); - net.AddInputFromArray("Scale", {1}, {4.0f}); - net.AddInputFromArray("Offset", {1}, {2.0}); - net.AddInputFromArray("Mean", {1}, {10}); - net.AddInputFromArray("Var", {1}, {11.67f}); - net.AddInputFromArray("Epsilon", {}, {1e-3}); + net.AddInputFromArray("Scale", {1}, {4.0f}); + net.AddInputFromArray("Offset", {1}, {2.0}); + net.AddInputFromArray("Mean", {1}, {10}); + net.AddInputFromArray("Var", {1}, {11.67f}); + net.AddInputFromArray("Epsilon", {}, {1e-3}); // Run - net.RunOp(); + net.RunOp(D); // Check auto expected = CreateTensor({1, 1, 6, 2}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); +} + +TEST_F(BatchNormOpTest, SimpleCPU) { + Simple(); +} + +TEST_F(BatchNormOpTest, SimpleNEON) { + Simple(); +} + +TEST_F(BatchNormOpTest, SimpleOPENCL) { + Simple(); } -TEST_F(BatchNormOpTest, SimpleNeon) { +TEST_F(BatchNormOpTest, SimpleRandomNeon) { + srand(time(NULL)); + + // generate random input + index_t batch = 1 + rand() % 10; + index_t channels = 3 + rand() % 50; + index_t height = 64; + index_t width = 64; + // Construct graph + auto &net = test_net(); + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}, true); + net.AddInputFromArray("Epsilon", {}, {1e-3}); + + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Run NEON + net.RunOp(DeviceType::NEON); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); +} + +TEST_F(BatchNormOpTest, ComplexRandomNeon) { srand(time(NULL)); // generate random input @@ -74,11 +128,96 @@ TEST_F(BatchNormOpTest, SimpleNeon) { net.RunOp(); // Check - Tensor *expected = net.GetOutput("Output"); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); // Run NEON net.RunOp(DeviceType::NEON); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); } + +TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { + srand(time(NULL)); + + // generate random input + index_t batch = 1 + rand() % 10; + index_t channels = 3 + rand() % 50; + index_t height = 64; + index_t width = 64; + // Construct graph + auto &net = test_net(); + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}, true); + net.AddInputFromArray("Epsilon", {}, {1e-3}); + + // Run NEON + net.RunOp(DeviceType::OPENCL); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run cpu + net.RunOp(); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); +} + +TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { + srand(time(NULL)); + + // generate random input + index_t batch = 1 + rand() % 10; + index_t channels = 3 + rand() % 50; + index_t height = 103; + index_t width = 113; + // Construct graph + auto &net = test_net(); + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Scale", {channels}); + net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Mean", {channels}); + net.AddRandomInput("Var", {channels}, true); + net.AddInputFromArray("Epsilon", {}, {1e-3}); + + // Run NEON + net.RunOp(DeviceType::OPENCL); + net.Sync(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run cpu + net.RunOp(); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2); +} + } diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 3b2ddfe0f69dcac9f624ff67907ad640a4f5505a..678f855fc7b98b5c7ec66f13e07e75ab121e067e 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -11,6 +11,7 @@ #include "mace/core/common.h" #include "mace/core/net.h" #include "mace/core/tensor.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { @@ -152,6 +153,12 @@ class OpsTestNet { return ws_.GetTensor(output_name); } + void Sync() { + if (net_) { + OpenCLRuntime::Get()->command_queue().finish(); + } + } + public: Workspace ws_; OperatorDef op_def_;