[LITE][OPENCL] optimise tune logic, default closed, test=develop (#3576)

91434cd3 · xiebaiyuan · GitHub · b44f47c4 · 91434cd3 · 91434cd3
5 changed files
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -157,6 +157,48 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                     static_cast<size_t>(gws0)};
 #endif
 }
+// Chooses a 3-D local work size for `global_work_size` whose total item count
+// fits within `max_work_size / divisor`.
+//
+// Extents are shrunk in the order dimension 1, then dimension 0, then
+// dimension 2. Each shrink step halves an even extent and collapses an odd
+// extent to 1, so every returned extent still evenly divides the matching
+// global extent.
+//
+// global_work_size: the three global extents to derive a local size from.
+// max_work_size:    upper bound on the product of the local extents; a value
+//                   of 0 disables shrinking and the global extents are
+//                   returned unchanged.
+// divisor:          values greater than 1 divide `max_work_size` before the
+//                   search; values <= 1 leave the budget untouched.
+//
+// Returns the chosen local extents in the same dimension order as the input.
+cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                                size_t max_work_size,
+                                                int divisor) {
+  auto dim0 = global_work_size[0];
+  auto dim1 = global_work_size[1];
+  auto dim2 = global_work_size[2];
+
+  if (divisor > 1 && max_work_size > 0) {
+    // Integer division truncates to 0 once divisor exceeds the budget. A zero
+    // budget would skip every loop below and hand the whole global size back
+    // as the local size, so keep room for at least one work-item.
+    const size_t reduced = max_work_size / static_cast<size_t>(divisor);
+    max_work_size = reduced > 0 ? reduced : 1;
+  }
+
+  if (max_work_size > 0) {
+    while (dim1 > max_work_size) {
+      dim1 = dim1 % 2 == 0 ? dim1 / 2 : 1;
+    }
+    while (dim0 * dim1 > max_work_size) {
+      dim0 = dim0 % 2 == 0 ? dim0 / 2 : 1;
+    }
+    while (dim2 * dim1 * dim0 > max_work_size) {
+      dim2 = dim2 % 2 == 0 ? dim2 / 2 : 1;
+    }
+  }
+
+  return cl::NDRange{static_cast<size_t>(dim0),
+                     static_cast<size_t>(dim1),
+                     static_cast<size_t>(dim2)};
+}
+// Reports whether the GPU type held by the global CLRuntime is ARM Mali.
+bool CLContext::IsArmMali() {
+  const GpuType& detected_gpu = CLRuntime::Global()->GetGpuType();
+  return GpuType::ARM_MALI == detected_gpu;
+}
 cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                     size_t max_work_size) {

--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -66,6 +66,10 @@ class CLContext {
  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
                                size_t max_work_size,
                                int divitor = 2);
+  // Derives a local work size from global_work_size by repeatedly halving
+  // extents (dimension 1 first, then 0, then 2) until their product fits
+  // within max_work_size divided by divitor (division applied only when
+  // divitor > 1).
+  cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                       size_t max_work_size,
+                                       int divitor = 2);
+  // True when CLRuntime::Global()->GetGpuType() equals GpuType::ARM_MALI.
+  bool IsArmMali();
  //  cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
  //                                   size_t max_work_size);

--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
    }
    return t_str;
  };
+  const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
+  LOG(INFO) << "device_version:" << device_version;
  LOG(INFO) << "device_type:" << device_type_to_str(device_type);
  device_info_["CL_DEVICE_TYPE"] = device_type;
@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
  return device_info_;
 }
+// Gives callers a non-const reference to the runtime's gpu_type_ member.
+GpuType& CLRuntime::GetGpuType() {
+  return gpu_type_;
+}
 void CLRuntime::GetAdrenoContextProperties(
    std::vector<cl_context_properties>* properties,
    GPUPerfMode gpu_perf_mode,

--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
@@ -93,6 +93,8 @@ class CLRuntime {
  std::map<std::string, size_t>& GetDeviceInfo();
+  // Returns a non-const reference to the stored gpu_type_ member.
+  GpuType& GetGpuType();
 private:
  CLRuntime() { Init(); }

--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -36,7 +36,7 @@ void ConvImageCompute::PrepareForRun() {
  float* filter_cpu = param.filter->mutable_data<float>();
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
+  const bool is_mali = context.cl_context()->IsArmMali();
  filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
  tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
  tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
@@ -63,6 +63,7 @@ void ConvImageCompute::PrepareForRun() {
  bool stride_equal = stride_h == stride_w;
  bool dilation_equal = dilations[0] == dilations[1];
+  VLOG(3) << "Is arm mali  / " << (is_mali ? "Yes" : "No");
  VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
  VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
          << " stride_w:" << stride_w << " pad_h:" << pad_h
@@ -278,7 +279,6 @@ void ConvImageCompute::PrepareForRun() {
 #endif
 #undef CONV3x3OPT_FALL_BACK
  } else if (kernel_h == 5 && kernel_w == 5) {
 #define CONV_5x5_OPT
 #ifndef CONV_5x5_OPT
@@ -393,7 +393,6 @@ void ConvImageCompute::PrepareForRun() {
    }
 #endif
 #undef CONV_7x7_OPT
  } else {
    LOG(FATAL) << "conv image compute not support this condition yet! ";
  }
@@ -477,6 +476,8 @@ void ConvImageCompute::PrepareForRun() {
    double min_turn_time = DBL_MAX;
    cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
        global_work_size_, max_work_group_size);
+    VLOG(3) << "origin  :local_work_size_ : " << best_local_work_size[0] << " "
+            << best_local_work_size[1] << " " << best_local_work_size[2];
    cl::NDRange last_local_work_size = cl::NDRange{
        static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
    if (use_turn_) {
@@ -495,7 +496,30 @@ void ConvImageCompute::PrepareForRun() {
          // skiped turned lws
          continue;
        }
-        auto turn_time = this->Turn(5);
+        auto turn_time = this->Turn(10);
+        if (min_turn_time > turn_time) {
+          min_turn_time = turn_time;
+          best_local_work_size = local_work_size_;
+        }
+        last_local_work_size = local_work_size_;
+      }
+      // reverse
+      for (size_t i = 1; i < 15; i++) {
+        if (kernel_h == 1 && kernel_w == 1) {
+          // todo use diff logics
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
+              global_work_size_, max_work_group_size, i);
+        } else {
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
+              global_work_size_, max_work_group_size, i);
+        }
+        if (last_local_work_size[0] == local_work_size_[0] &&
+            last_local_work_size[1] == local_work_size_[1] &&
+            last_local_work_size[2] == local_work_size_[2]) {
+          // skiped turned lws
+          continue;
+        }
+        auto turn_time = this->Turn(10);
        if (min_turn_time > turn_time) {
          min_turn_time = turn_time;
          best_local_work_size = local_work_size_;
@@ -504,6 +528,8 @@ void ConvImageCompute::PrepareForRun() {
      }
    }
    local_work_size_ = best_local_work_size;
+    VLOG(3) << "chossen :local_work_size_ : " << local_work_size_[0] << " "
+            << local_work_size_[1] << " " << local_work_size_[2];
    VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
            << local_work_size_[1] << "," << local_work_size_[2] << "}";
  }