diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 23bedc6eb30a2753ba67c966c8102385dd293dd0..82efa4595ae6e4f091ce9618dc4e1b16199f382d 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -20,9 +20,12 @@ void BatchNormFunctor::operator()( const Tensor *epsilon, Tensor *output) { + index_t pixel_size = input->dim(2) * input->dim(3); + index_t blocks = (pixel_size + 3) / 4; + const uint32_t gws[3] = {static_cast(input->dim(0)), static_cast(input->dim(1)), - static_cast(input->dim(2) * input->dim(3))}; + static_cast(blocks)}; auto runtime = OpenCLRuntime::Get(); @@ -39,10 +42,10 @@ void BatchNormFunctor::operator()( bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); bm_kernel.setArg(idx++, *(static_cast(epsilon->buffer()))); - bm_kernel.setArg(idx++, gws[2]); + bm_kernel.setArg(idx++, static_cast(pixel_size)); bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr); - bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr); + bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); + bm_kernel.setArg(idx++, lws[1] * sizeof(float) * 4, nullptr); auto params_generator = [&kwg_size]()->std::vector> { return {{1, 1, 64}, diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index e86c62f336dd62b2aab0d5226f084756eff8350d..3fc449ce499723b4ab78b4e2627be537fe40e978 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -6,8 +6,8 @@ void kernel batch_norm(global const float *input, global const float *epsilon, private const uint pixels, global float *output, - __local float *new_scale, - __local float *new_offset) { + __local float4 *new_scale, + __local float4 *new_offset) { const int batch = get_global_id(0); const int channel = get_global_id(1); const int channels = get_global_size(1); @@ -16,15 +16,26 @@ void kernel batch_norm(global const float *input, const int local_pixel_idx = get_local_id(2); if(local_pixel_idx == 0) { - new_scale[local_channel] = scale[channel] * rsqrt(var[channel] + *epsilon); - new_offset[local_channel] = offset[channel] - mean[channel] * new_scale[local_channel]; + new_scale[local_channel] = (float4)(scale[channel] * rsqrt(var[channel] + *epsilon)); + new_offset[local_channel] = (float4)(offset[channel] - mean[channel] * new_scale[local_channel].x); } barrier(CLK_LOCAL_MEM_FENCE); - const int sample_offset = (batch * channels + channel) * pixels + pixel_offset; - const float *input_ptr = input + sample_offset; - float *output_ptr = output + sample_offset; - *output_ptr = new_scale[local_channel] * *input_ptr + new_offset[local_channel]; + const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4; + const float *input_ptr = input + image_offset; + float *output_ptr = output + image_offset; + const int end = (batch * channels + channel + 1) * pixels; + if ((image_offset+4) > end) { + for (int i = image_offset; i < end; ++i) { + *output_ptr = new_scale[local_channel].x * *input_ptr + new_offset[local_channel].x; + ++input_ptr; + ++output_ptr; + } + } else { + float4 values = vload4(0, input_ptr); + values = values * new_scale[local_channel] + new_offset[local_channel]; + vstore4(values, 0, output_ptr); + } } diff --git a/mace/utils/BUILD b/mace/utils/BUILD index 50f65f4e9010686de01f97298ba7ef75d7abcdad..cd5583dfc9f35b214c952c8647e20dfb0c091c3d 100644 --- a/mace/utils/BUILD +++ b/mace/utils/BUILD @@ -39,6 +39,7 @@ cc_library( copts = ["-std=c++11"], deps = [ "//mace/core", + "//mace/core:opencl_runtime", ], ) diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index de96f87e27402ba757d3cc713a7b0146c7d5bb49..1d36f7f5b170fc109bc7596bb556b0e8e3ed6959 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -33,7 +33,7 @@ class Tuner { const std::function>()> ¶m_generator, const std::function &)> &func) { - if (IsTuning()) { + if (IsTuning() && param_generator != nullptr) { // tune std::vector opt_param = default_param; RetType res = Tune(param_generator, func, opt_param); @@ -68,7 +68,7 @@ class Tuner { } inline void WriteRunParameters() { - VLOG(0) << path_; + VLOG(1) << path_; if (path_ != nullptr) { std::ofstream ofs(path_, std::ios::binary | std::ios::out); if (ofs.is_open()) { @@ -78,14 +78,14 @@ class Tuner { int32_t key_size = kp.first.size(); ofs.write(reinterpret_cast(&key_size), sizeof(key_size)); ofs.write(kp.first.c_str(), key_size); - VLOG(0) << kp.first.c_str(); + VLOG(1) << kp.first.c_str(); auto ¶ms = kp.second; int32_t params_size = params.size() * sizeof(param_type); ofs.write(reinterpret_cast(¶ms_size), sizeof(params_size)); for (auto ¶m : params) { ofs.write(reinterpret_cast(¶m), sizeof(params_size)); - VLOG(0) << param; + VLOG(1) << param; } } ofs.close(); @@ -144,7 +144,7 @@ class Tuner { } template - inline RetType Tune(std::function>()> param_generator, + inline RetType Tune(const std::function>()> ¶m_generator, const std::function &)> &func, std::vector &opt_params) { RetType res; diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc index bcb5c620f3b553d3d6f8572fd88573d159d0a6cd..ea80dd4dd87716245ceb32ccb1c2c9d8a197df2a 100644 --- a/mace/utils/tuner_test.cc +++ b/mace/utils/tuner_test.cc @@ -13,7 +13,8 @@ class TunerTest: public ::testing::Test { protected: virtual void SetUp() { remove( "/data/local/tmp/mace.config" ); - setenv("MACE_RUN_PARAMTER_PATH", "/data/local/tmp/mace.config", 1); + setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1); + setenv("MACE_TUNING", "1", 1); } };