diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl index 33fe65d00b2496315b86cde094cd4459f17c3a51..17d391658e367e7b862ff713c177766a70e203f3 100644 --- a/mace/kernels/opencl/cl/relu.cl +++ b/mace/kernels/opencl/cl/relu.cl @@ -1,35 +1,29 @@ #include // Supported data type: half/float -__kernel void relu(__global const DATA_TYPE *input, - __private const int size, - __global DATA_TYPE *output) { - int idx = get_global_id(0); +__kernel void relu(__read_only image2d_t input, + __write_only image2d_t output) { + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + const int width = get_global_size(1); - if (idx + 4 > size) { - for(; idx < size; ++idx) { - *(output+idx) = fmax(*(input+idx), 0); - } - } else { - VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input); - data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0); - vstore4(data, idx, output); - } + const int pos = ch_blk * width + w; + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 out = fmax(in, (DATA_TYPE4)0); + WRITE_IMAGET(output, (int2)(pos, hb), out); } -__kernel void relux(__global const DATA_TYPE *input, +__kernel void relux(__read_only image2d_t input, __private const DATA_TYPE max_limit, - __private const int size, - __global DATA_TYPE *output) { - int idx = get_global_id(0); + __write_only image2d_t output) { + const int ch_blk = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); + const int width = get_global_size(1); - if (idx + 4 > size) { - for(; idx < size; ++idx) { - *(output+idx) = clamp(*(input+idx), 0.0f, max_limit); - } - } else { - VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input); - data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit); - vstore4(data, idx, output); - } + const int pos = ch_blk * width + w; + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 out = clamp(in, (DATA_TYPE4)0, (DATA_TYPE4)max_limit); + WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index 1149b965a2fc91c5394c97b7028d872b827dc125..46988793525883d1305c33a6cf548f70f58c2025 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -6,58 +6,70 @@ #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { -template <> -void ReluFunctor::operator()(const Tensor *input, +template +void ReluFunctor::operator()(const Tensor *input, Tensor *output) { - index_t element_size = input->NumElements(); - index_t blocks = (element_size + 3) / 4; + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); - const uint32_t gws = blocks; + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + auto dt = DataTypeToEnum::value; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); if (max_limit_ < 0) { auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options); - const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); + const uint32_t lws[3] = {1, kwg_size, 1}; uint32_t idx = 0; relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); - relu_kernel.setArg(idx++, static_cast(element_size)); relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( relu_kernel, cl::NullRange, - cl::NDRange(gws), - cl::NDRange(lws), + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } else { auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options); - - const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); + const uint32_t lws[3] = {1, kwg_size, 1}; uint32_t idx = 0; relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); relu_kernel.setArg(idx++, max_limit_); - relu_kernel.setArg(idx++, static_cast(element_size)); relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( relu_kernel, cl::NullRange, - cl::NDRange(gws), - cl::NDRange(lws), + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } } +template +struct ReluFunctor; +template +struct ReluFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h index 347d586bee77a4c3948d032919ce1bdf1970e945..ea94c79a96ad523653ec0defd53902af3ac698f6 100644 --- a/mace/kernels/relu.h +++ b/mace/kernels/relu.h @@ -33,11 +33,15 @@ struct ReluFunctor { template <> void ReluFunctor::operator()(const Tensor *input, Tensor *output); -template <> -void ReluFunctor::operator()(const Tensor *input, - Tensor *output); + +template +struct ReluFunctor { + T max_limit_; + + void operator()(const Tensor *input, Tensor *output); +}; } // namespace kernels } // namespace mace -#endif // MACE_KERNELS_RELU_H_ \ No newline at end of file +#endif // MACE_KERNELS_RELU_H_ diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc index c86fb38f1f9a56f8d0721593e48e1bfb4b67db05..3365439398af6d1aded3d1f28304958da097b7ab 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/relu.cc @@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp); REGISTER_NEON_OPERATOR(Relu, ReluOp); #endif // __ARM_NEON -REGISTER_OPENCL_OPERATOR(Relu, ReluOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); + +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu") + .TypeConstraint("T") + .Build(), + ReluOp); + } // namespace mace diff --git a/mace/ops/relu.h b/mace/ops/relu.h index fea49c8da843ce1f6cc3010209b2d8697963342a..613776a870b736b9a2166339301674bf484fe4fe 100644 --- a/mace/ops/relu.h +++ b/mace/ops/relu.h @@ -16,7 +16,7 @@ class ReluOp : public Operator { ReluOp(const OperatorDef &operator_def, Workspace *ws) : Operator(operator_def, ws) { functor_.max_limit_ = - OperatorBase::GetSingleArgument("max_limit", static_cast(-1)); + OperatorBase::GetSingleArgument("max_limit", static_cast(-1)); } bool Run() override { const Tensor *input_tensor = this->inputs_[0]; diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc index 14badcd9f98083b3b50f93d672cfc4a53d692b27..c68009c9cb1c6bd29806078c69ab47ff005689fa 100644 --- a/mace/ops/relu_benchmark.cc +++ b/mace/ops/relu_benchmark.cc @@ -9,17 +9,28 @@ namespace mace { template -static void ReluBenchmark(int iters, int size) { +static void ReluBenchmark( + int iters, int batch, int channels, int height, int width) { mace::testing::StopTiming(); OpsTestNet net; - OpDefBuilder("Relu", "ReluBM") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {size}); + net.AddRandomInput("Input", {batch, height, width, channels}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("Relu", "ReluBM") + .Input("InputImage") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Relu", "ReluBM") + .Input("Input") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } // Warm-up for (int i = 0; i < 5; ++i) { @@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) { net.Sync(); } -#define BM_RELU_MACRO(SIZE, TYPE, DEVICE) \ - static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) { \ - const int64_t tot = static_cast(iters) * SIZE; \ - mace::testing::ItemsProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - ReluBenchmark(iters, SIZE); \ - } \ - BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE) - -#define BM_RELU(SIZE, TYPE) \ - BM_RELU_MACRO(SIZE, TYPE, CPU); \ - BM_RELU_MACRO(SIZE, TYPE, NEON);\ - BM_RELU_MACRO(SIZE, TYPE, OPENCL); - -BM_RELU(1000, float); -BM_RELU(100000, float); -BM_RELU(10000000, float); -} // namespace mace \ No newline at end of file +#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + ReluBenchmark(iters, N, C, H, W); \ + } \ + BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE) + +#define BM_RELU(N, C, H, W, TYPE) \ + BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \ + BM_RELU_MACRO(N, C, H, W, TYPE, NEON);\ + BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL); + +BM_RELU(1, 1, 512, 512, float); +BM_RELU(1, 3, 128, 128, float); +BM_RELU(1, 3, 512, 512, float); +BM_RELU(1, 32, 112, 112, float); +BM_RELU(1, 64, 256, 256, float); +} // namespace mace diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc index 56aace07079264d6870411406449ea735dac25ce..879ecb3211f75ebeb791467ae152d40e653ccdc9 100644 --- a/mace/ops/relu_test.cc +++ b/mace/ops/relu_test.cc @@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {}; template void TestSimple() { OpsTestNet net; - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", @@ -23,8 +19,28 @@ void TestSimple() { {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("Relu", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Relu", "ReluTest") + .Input("Input") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } auto expected = CreateTensor({2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, @@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) { template void TestUnalignedSimple() { OpsTestNet net; - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", - {1, 1, 3, 2}, + {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("Relu", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Relu", "ReluTest") + .Input("Input") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } - auto expected = CreateTensor({1, 1, 3, 2}, + auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) { template void TestSimpleReluX() { OpsTestNet net; - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", @@ -94,8 +121,30 @@ void TestSimpleReluX() { {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("Relu", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Relu", "ReluTest") + .Input("Input") + .Output("Output") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } auto expected = CreateTensor({2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, @@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) { template void TestUnalignedSimpleReluX() { OpsTestNet net; - OpDefBuilder("Relu", "ReluTest") - .Input("Input") - .Output("Output") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", - {1, 1, 1, 7}, + {1, 1, 7, 1}, {-7, 7, -6, 6, -5, 5, -4}); - // Run - net.RunOp(D); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + + OpDefBuilder("Relu", "ReluTest") + .Input("InputImage") + .Output("OutputImage") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("Relu", "ReluTest") + .Input("Input") + .Output("Output") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } - auto expected = CreateTensor({1, 1, 1, 7}, + auto expected = CreateTensor({1, 1, 7, 1}, {0, 6, 0, 6, 0, 5, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5);