diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 5f00d747967695bfd379f331c03033eeaebd20f9..36b2925742ce6214d3d4d41146221750f47a35b2 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -28,10 +28,9 @@ struct BatchNormFunctor { // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } // new_offset = \offset - mean * common_val; // Y = new_scale * X + new_offset; - const index_t batchs = input->dim(0); + const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); - const index_t height_width = height * width; const index_t channels = input->dim(3); Tensor::MappingGuard input_mapper(input); @@ -62,11 +61,13 @@ struct BatchNormFunctor { index_t pos = 0; #pragma omp parallel for - for (index_t n = 0; n < batchs; ++n) { - for (index_t hb = 0; hb < height_width; ++hb) { - for (index_t c = 0; c < channels; ++c) { - output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; - ++pos; + for (index_t n = 0; n < batch; ++n) { + for (index_t h = 0; h < height; ++h) { + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < channels; ++c) { + output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; + ++pos; + } } } } diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index c810d9290259912660a35b522dac5661d2fa9c11..c17286895a8868732ada5608d9454cae31cdd746 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -21,7 +21,7 @@ void BatchNormFunctor::operator()( const Tensor *epsilon, Tensor *output) { - const index_t batchs = input->dim(0); + const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); const index_t channels = input->dim(3); @@ -30,7 +30,7 @@ void BatchNormFunctor::operator()( const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), - static_cast(height * batchs)}; + static_cast(height * batch)}; auto runtime = OpenCLRuntime::Get(); std::set built_options; @@ -40,7 +40,7 @@ void BatchNormFunctor::operator()( auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - const std::vector lws = {1, 1, kwg_size}; + const std::vector lws = {1, kwg_size, 1}; uint32_t idx = 0; bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); @@ -52,7 +52,8 @@ void BatchNormFunctor::operator()( bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); auto params_generator = [&kwg_size]()->std::vector> { - return {{1, 1, 64}, + return {{8, 128, 1}, //SNPE size + {1, 1, 64}, {1, 1, 128}, {1, kwg_size/16, 16}, {1, kwg_size/32, 32}, diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index e3d5ae5ec01c19679391c86d7e04015df6ac23b2..d0ad2e2aca77a2cc0fb7a51a8a4671060842b077 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -8,24 +8,21 @@ __kernel void batch_norm(__read_only image2d_t input, __global const DATA_TYPE *epsilon, __write_only image2d_t output) { const int ch_blk = get_global_id(0); - const int w_blk = get_global_id(1); - const int hb_blk = get_global_id(2); + const int w = get_global_id(1); + const int hb = get_global_id(2); const int width = get_global_size(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - - DATA_TYPE4 scale_value = READ_IMAGET(scale, sampler, (int2)(ch_blk, 0)); - DATA_TYPE4 offset_value = READ_IMAGET(offset, sampler, (int2)(ch_blk, 0)); - DATA_TYPE4 mean_value = READ_IMAGET(mean, sampler, (int2)(ch_blk, 0)); - DATA_TYPE4 var_value = READ_IMAGET(var, sampler, (int2)(ch_blk, 0)); + DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon)); DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; - const int pos = ch_blk * width + w_blk; + const int pos = ch_blk * width + w; - DATA_TYPE4 in = READ_IMAGET(input, sampler, (int2)(pos, hb_blk)); + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 out = in * new_scale + new_offset; - WRITE_IMAGET(output, (int2)(pos, hb_blk), out); + WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 40bb54d13b4882c4aa5e7e6a4abd2097370c37fd..73e386caab16bbaff893fb56553a5ba3c4d5bae0 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -5,8 +5,6 @@ #include "mace/core/operator.h" #include "mace/ops/ops_test_util.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" - namespace mace { class BatchNormOpTest : public OpsTestBase {};