diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc
index eff959d992200592c21a024f56713b9abb4b87fb..67d679fdd596b109b714bf7ba3cd45b2632b9420 100644
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -157,6 +157,58 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                      static_cast(gws0)};
 #endif
 }
+
+// Derives a local work size from |global_work_size| whose total element count
+// fits a budget of max_work_size / divisor (a divisor of 1 or less leaves the
+// budget as passed in).
+//
+// Dimensions are shrunk in the order [1], [0], [2]. Each step halves an even
+// extent or drops an odd extent to 1, and repeats until the running product
+// ([1], then [0] * [1], then [0] * [1] * [2]) no longer exceeds the budget.
+//
+// If the division would round a non-zero max_work_size down to 0, the budget
+// is held at 1 so the result becomes {1, 1, 1} rather than the unreduced
+// global size. A max_work_size of 0 on entry disables shrinking.
+cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                                size_t max_work_size,
+                                                int divisor) {
+  size_t lws0 = global_work_size[0];
+  size_t lws1 = global_work_size[1];
+  size_t lws2 = global_work_size[2];
+
+  if (divisor > 1) {
+    const bool had_budget = max_work_size > 0;
+    max_work_size /= divisor;
+    if (had_budget && max_work_size == 0) {
+      // Integer division wiped out the budget; fall back to the minimum of 1.
+      max_work_size = 1;
+    }
+  }
+
+  // Even extents are halved; odd ones cannot be split evenly and drop to 1.
+  const auto shrink = [](size_t extent) -> size_t {
+    return extent % 2 == 0 ? extent / 2 : 1;
+  };
+
+  // A zero budget would make these loops spin forever on a non-zero extent.
+  if (max_work_size > 0) {
+    while (lws1 > max_work_size) {
+      lws1 = shrink(lws1);
+    }
+    while (lws0 * lws1 > max_work_size) {
+      lws0 = shrink(lws0);
+    }
+    while (lws0 * lws1 * lws2 > max_work_size) {
+      lws2 = shrink(lws2);
+    }
+  }
+  return cl::NDRange{lws0, lws1, lws2};
+}
+
+// Reports whether the global CLRuntime's GPU type is GpuType::ARM_MALI.
+bool CLContext::IsArmMali() {
+  return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI;
+}
 
 cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                      size_t max_work_size) {
diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h
index 06e6c7ee46d8b839873d433843f0035e3963664c..69ae11a8d71cc8c3dcae2b7ba81b4e19b44d1abe 100644
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -66,6 +66,14 @@ class CLContext {
   cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
                                 size_t max_work_size,
                                 int divitor = 2);
+  // Returns a local work size for |global_work_size| sized to fit a budget of
+  // max_work_size / divisor work-items; dimension [1] is shrunk first, then
+  // [0], then [2].
+  cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                       size_t max_work_size,
+                                       int divisor = 2);
+  // True when the global CLRuntime reports GpuType::ARM_MALI.
+  bool IsArmMali();
   // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
   //                                  size_t max_work_size);
 
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index 93ceb6f9c5c55a406ca51b8c2d6279d304293fb3..929ec7838e23b9ca9259c19cd1808379664dbec3 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
     }
     return t_str;
   };
+  const std::string device_version = device_->getInfo();
+  LOG(INFO) << "device_version:" << device_version;
+
   LOG(INFO) << "device_type:" << device_type_to_str(device_type);
   device_info_["CL_DEVICE_TYPE"] = device_type;
 
@@ -317,6 +320,9 @@ std::map& CLRuntime::GetDeviceInfo() {
   return device_info_;
 }
 
+// Returns a mutable reference to the gpu_type_ member.
+GpuType& CLRuntime::GetGpuType() { return gpu_type_; }
+
 void CLRuntime::GetAdrenoContextProperties(
     std::vector* properties,
     GPUPerfMode gpu_perf_mode,
diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h
index 122422c79b80bc7e0823136cac150613d8c597dc..51e545cc3482ed7d080baa2734c8f84d8b486d3e 100644
--- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -93,6 +93,8 @@ class CLRuntime { std::map& GetDeviceInfo(); + GpuType& GetGpuType(); + private: CLRuntime() { Init(); } diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 3de4512cb1d9d06b95d14c51615d5ab87e0a7419..362be682efc1c2330e27840ffded9586fa53ddf9 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -36,7 +36,7 @@ void ConvImageCompute::PrepareForRun() { float* filter_cpu = param.filter->mutable_data(); auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); - + const bool is_mali = context.cl_context()->IsArmMali(); filter_gpu_image_ = std::unique_ptr(new Tensor); tensor_hold_filter_image_ = std::unique_ptr(new Tensor); tensor_hold_bias_image_ = std::unique_ptr(new Tensor); @@ -63,6 +63,7 @@ void ConvImageCompute::PrepareForRun() { bool stride_equal = stride_h == stride_w; bool dilation_equal = dilations[0] == dilations[1]; + VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -278,7 +279,6 @@ void ConvImageCompute::PrepareForRun() { #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT @@ -393,7 +393,6 @@ void ConvImageCompute::PrepareForRun() { } #endif #undef CONV_7x7_OPT - } else { LOG(FATAL) << "conv image compute not support this condition yet! 
"; } @@ -477,6 +476,8 @@ void ConvImageCompute::PrepareForRun() { double min_turn_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); + VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " + << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; if (use_turn_) { @@ -495,7 +496,30 @@ void ConvImageCompute::PrepareForRun() { // skiped turned lws continue; } - auto turn_time = this->Turn(5); + auto turn_time = this->Turn(10); + if (min_turn_time > turn_time) { + min_turn_time = turn_time; + best_local_work_size = local_work_size_; + } + last_local_work_size = local_work_size_; + } + // reverse + for (size_t i = 1; i < 15; i++) { + if (kernel_h == 1 && kernel_w == 1) { + // todo use diff logics + local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + global_work_size_, max_work_group_size, i); + } else { + local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + global_work_size_, max_work_group_size, i); + } + if (last_local_work_size[0] == local_work_size_[0] && + last_local_work_size[1] == local_work_size_[1] && + last_local_work_size[2] == local_work_size_[2]) { + // skiped turned lws + continue; + } + auto turn_time = this->Turn(10); if (min_turn_time > turn_time) { min_turn_time = turn_time; best_local_work_size = local_work_size_; @@ -504,6 +528,8 @@ void ConvImageCompute::PrepareForRun() { } } local_work_size_ = best_local_work_size; + VLOG(3) << "chossen :local_work_size_ : " << local_work_size_[0] << " " + << local_work_size_[1] << " " << local_work_size_[2]; VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," << local_work_size_[1] << "," << local_work_size_[2] << "}"; }