From 01b400c8ab25efd9411bc142241d2beab2adc5f0 Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Fri, 8 May 2020 15:49:28 +0700 Subject: [PATCH] [LITE][OPENCL] optimisei tune logic ,default close ,test=develop (#3576) --- lite/backends/opencl/cl_context.cc | 42 +++++++++++++++++++++++ lite/backends/opencl/cl_context.h | 4 +++ lite/backends/opencl/cl_runtime.cc | 5 +++ lite/backends/opencl/cl_runtime.h | 2 ++ lite/kernels/opencl/conv_image_compute.cc | 34 +++++++++++++++--- 5 files changed, 83 insertions(+), 4 deletions(-) diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index eff959d992..67d679fdd5 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -157,6 +157,48 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } +cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 0 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 0 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} + +bool CLContext::IsArmMali() { + return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI; +} cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size) { diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 06e6c7ee46..69ae11a8d7 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -66,6 +66,10 @@ class CLContext { cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); + cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + size_t max_work_size, + int divitor = 2); + bool IsArmMali(); // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // size_t max_work_size); diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 93ceb6f9c5..929ec7838e 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() { } return t_str; }; + const std::string device_version = device_->getInfo(); + LOG(INFO) << "device_version:" << device_version; + LOG(INFO) << "device_type:" << device_type_to_str(device_type); device_info_["CL_DEVICE_TYPE"] = device_type; @@ -317,6 +320,8 @@ std::map& CLRuntime::GetDeviceInfo() { return device_info_; } +GpuType& CLRuntime::GetGpuType() { return gpu_type_; } + void CLRuntime::GetAdrenoContextProperties( std::vector* properties, GPUPerfMode gpu_perf_mode, diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 122422c79b..51e545cc34 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -93,6 +93,8 @@ class CLRuntime { std::map& GetDeviceInfo(); + GpuType& GetGpuType(); + private: CLRuntime() { Init(); } diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 3de4512cb1..362be682ef 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -36,7 +36,7 @@ void ConvImageCompute::PrepareForRun() { float* filter_cpu = param.filter->mutable_data(); auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); - + const bool is_mali = context.cl_context()->IsArmMali(); filter_gpu_image_ = std::unique_ptr(new Tensor); tensor_hold_filter_image_ = std::unique_ptr(new Tensor); tensor_hold_bias_image_ = std::unique_ptr(new Tensor); @@ -63,6 +63,7 @@ void ConvImageCompute::PrepareForRun() { bool stride_equal = stride_h == stride_w; bool dilation_equal = dilations[0] == dilations[1]; + VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -278,7 +279,6 @@ void ConvImageCompute::PrepareForRun() { #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT @@ -393,7 +393,6 @@ void ConvImageCompute::PrepareForRun() { } #endif #undef CONV_7x7_OPT - } else { LOG(FATAL) << "conv image compute not support this condition yet! "; } @@ -477,6 +476,8 @@ void ConvImageCompute::PrepareForRun() { double min_turn_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); + VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " + << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; if (use_turn_) { @@ -495,7 +496,30 @@ void ConvImageCompute::PrepareForRun() { // skiped turned lws continue; } - auto turn_time = this->Turn(5); + auto turn_time = this->Turn(10); + if (min_turn_time > turn_time) { + min_turn_time = turn_time; + best_local_work_size = local_work_size_; + } + last_local_work_size = local_work_size_; + } + // reverse + for (size_t i = 1; i < 15; i++) { + if (kernel_h == 1 && kernel_w == 1) { + // todo use diff logics + local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + global_work_size_, max_work_group_size, i); + } else { + local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + global_work_size_, max_work_group_size, i); + } + if (last_local_work_size[0] == local_work_size_[0] && + last_local_work_size[1] == local_work_size_[1] && + last_local_work_size[2] == local_work_size_[2]) { + // skiped turned lws + continue; + } + auto turn_time = this->Turn(10); if (min_turn_time > turn_time) { min_turn_time = turn_time; best_local_work_size = local_work_size_; @@ -504,6 +528,8 @@ void ConvImageCompute::PrepareForRun() { } } local_work_size_ = best_local_work_size; + VLOG(3) << "chossen :local_work_size_ : " << local_work_size_[0] << " " + << local_work_size_[1] << " " << local_work_size_[2]; VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," << local_work_size_[1] << "," << local_work_size_[2] << "}"; } -- GitLab