From 511ee8785066fb8136d89ea7c543f12037d8d02f Mon Sep 17 00:00:00 2001 From: yejianwu Date: Sun, 3 Dec 2017 16:12:46 +0800 Subject: [PATCH] update batch norm global size to (channel+3)/4, width, height --- mace/kernels/opencl/batch_norm_opencl.cc | 5 +-- mace/kernels/opencl/cl/batch_norm.cl | 49 +++++++----------------- mace/ops/batch_norm_benchmark.cc | 43 ++++++++++++++------- mace/ops/batch_norm_test.cc | 36 ++++++++--------- 4 files changed, 63 insertions(+), 70 deletions(-) diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 67188da1..271ef9da 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -30,7 +30,7 @@ void BatchNormFunctor::operator()( const index_t width_blocks = RoundUpDiv4(width); const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), + static_cast(width), static_cast(height * batchs)}; auto runtime = OpenCLRuntime::Get(); @@ -49,10 +49,7 @@ void BatchNormFunctor::operator()( bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast(epsilon->buffer()))); - bm_kernel.setArg(idx++, static_cast(width)); bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, lws[0] * sizeof(float) * 4, nullptr); - bm_kernel.setArg(idx++, lws[0] * sizeof(float) * 4, nullptr); auto params_generator = [&kwg_size]()->std::vector> { return {{1, 1, 64}, diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index bc44c2bf..8294d6df 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,52 +1,31 @@ #include // Supported data types: half/float -void kernel batch_norm(__read_only image2d_t input, +__kernel void batch_norm(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, __read_only image2d_t mean, __read_only image2d_t var, global const DATA_TYPE *epsilon, - private const int width, - __write_only image2d_t output, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale, - __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) { + __write_only image2d_t output) { const int ch_blk = get_global_id(0); const int w_blk = get_global_id(1); const int hb_blk = get_global_id(2); - - const int local_channel = get_local_id(0); - const int local_w_idx = get_local_id(1); - const int local_hb_idx = get_local_id(2); + const int width = get_global_size(1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - if(local_hb_idx == 0 && local_w_idx == 0) { - VEC_DATA_TYPE(DATA_TYPE, 4) scale4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(scale, sampler, (int2)(ch_blk, 0)); - VEC_DATA_TYPE(DATA_TYPE, 4) offset4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(offset, sampler, (int2)(ch_blk, 0)); - VEC_DATA_TYPE(DATA_TYPE, 4) mean4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(mean, sampler, (int2)(ch_blk, 0)); - VEC_DATA_TYPE(DATA_TYPE, 4) var4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(var, sampler, (int2)(ch_blk, 0)); - new_scale[local_channel] = scale4 * rsqrt(var4 + (VEC_DATA_TYPE(DATA_TYPE, 4))(*epsilon)); - new_offset[local_channel] = offset4 - mean4 * new_scale[local_channel]; - } + DATA_TYPE4 scale_value = READ_IMAGET(scale, sampler, (int2)(ch_blk, 0)); + DATA_TYPE4 offset_value = READ_IMAGET(offset, sampler, (int2)(ch_blk, 0)); + DATA_TYPE4 mean_value = READ_IMAGET(mean, sampler, (int2)(ch_blk, 0)); + DATA_TYPE4 var_value = READ_IMAGET(var, sampler, (int2)(ch_blk, 0)); - barrier(CLK_LOCAL_MEM_FENCE); + DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon)); + DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; - VEC_DATA_TYPE(DATA_TYPE, 4) in[4]; - const int width_pos = w_blk << 2; - const int pos = ch_blk * width + width_pos; - if (width_pos + 4 < width) { - for (int i = 0; i < 4; ++i) { - in[i] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(pos + i, hb_blk)); - VEC_DATA_TYPE(DATA_TYPE, 4) res = in[i] * new_scale[local_channel] + new_offset[local_channel]; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(pos + i, hb_blk), res); - } - } else { - for (int i = 0; i < width - width_pos; ++i) { - in[i] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(pos + i, hb_blk)); - VEC_DATA_TYPE(DATA_TYPE, 4) res = in[i] * new_scale[local_channel] + new_offset[local_channel]; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(pos + i, hb_blk), res); - } - } -} + const int pos = ch_blk * width + w_blk; + DATA_TYPE4 in = READ_IMAGET(input, sampler, (int2)(pos, hb_blk)); + DATA_TYPE4 out = in * new_scale + new_offset; + WRITE_IMAGET(output, (int2)(pos, hb_blk), out); +} diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index e0d56173..4b34de14 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -13,28 +13,45 @@ static void BatchNorm( int iters, int batch, int channels, int height, int width) { mace::testing::StopTiming(); - if ( D == OPENCL ) - OpenCLRuntime::EnableProfiling(); - OpsTestNet net; - OpDefBuilder("BatchNorm", "BatchNormBM") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .Input("Epsilon") - .Output("Output") - .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); net.AddRandomInput("Var", {channels}, true); net.AddInputFromArray("Epsilon", {}, {1e-3}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + else { + OpDefBuilder("BatchNorm", "BatchNormBM") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .Input("Epsilon") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // tuning setenv("MACE_TUNING", "1", 1); net.RunOp(D); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 01e81067..40bb54d1 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -25,11 +25,11 @@ void Simple() { net.AddInputFromArray("Epsilon", {}, {1e-3}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputImage") @@ -44,7 +44,7 @@ void Simple() { net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") @@ -202,11 +202,11 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { expected.Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputImage") @@ -227,7 +227,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { net.RunOp(DeviceType::OPENCL); net.Sync(); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -269,11 +269,11 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // Run on opencl - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); - BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Var", "VarImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputImage") @@ -294,7 +294,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::OPENCL); net.Sync(); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } -- GitLab