[LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog… (#3309)

* [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop

[LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog… (#3309)
* [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop * [LITE][OPENCL][Image] add lws turn & close cl check when shutdownlog , test=develop
942bc409 · xiebaiyuan · GitHub · 185c7096 · 942bc409 · 942bc409
5 changed file
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "lite/backends/opencl/cl_context.h"
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>
@@ -121,14 +122,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
  }
 }

+// Picks a 3D local work size for `global_work_size` during lws tuning.
+// The device limit `max_work_size` is first divided by `divisor` (when
+// divisor > 1); each dimension is then halved while it is even, or dropped
+// to 1 once it is odd, until gws1, then gws1 * gws2, then the full product
+// fit within that budget. Every returned dimension therefore divides the
+// corresponding global dimension.
+cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
+                                         size_t max_work_size,
+                                         int divisor) {
+  auto gws0 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws2 = global_work_size[2];
+
+  if (divisor > 1 && max_work_size > 0) {
+    // Keep the budget at least 1: integer division by a divisor above
+    // max_work_size would yield 0, which would disable every loop below
+    // and return the unshrunk global size as the local size.
+    max_work_size = std::max<size_t>(max_work_size / divisor, 1);
+  }
+  // A zero budget (only possible when the caller passed 0) leaves the
+  // sizes untouched.
+  const bool can_shrink = max_work_size > 0;
+
+  // Shrink order is dim 1, then dim 2, then dim 0.
+  while (can_shrink && gws1 > max_work_size) {
+    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
+  }
+  while (can_shrink && gws2 * gws1 > max_work_size) {
+    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
+  }
+  while (can_shrink && gws0 * gws1 * gws2 > max_work_size) {
+    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
+  }
+
+  return cl::NDRange{static_cast<size_t>(gws0),
+                     static_cast<size_t>(gws1),
+                     static_cast<size_t>(gws2)};
+}
+
 cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                     size_t max_work_size) {
  int preferred_lws = 0;
  int divisor = 2;

-  auto tmp0 = global_work_size[0];
-  auto tmp1 = global_work_size[1];
-  auto tmp2 = global_work_size[2];
+  auto gws0 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws2 = global_work_size[2];

  if (divisor > 1) {
    max_work_size /= divisor;
@@ -136,18 +176,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
    max_work_size = preferred_lws;
  }
-  while (tmp1 > max_work_size && max_work_size > 0) {
-    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  while (gws1 > max_work_size && max_work_size > 0) {
+    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
  }
-  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
-    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
+    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
  }
-  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
-    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
+    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
  }
-  return cl::NDRange{static_cast<size_t>(tmp0),
-                     static_cast<size_t>(tmp1),
-                     static_cast<size_t>(tmp2)};
+  return cl::NDRange{static_cast<size_t>(gws0),
+                     static_cast<size_t>(gws1),
+                     static_cast<size_t>(gws2)};
 }

 }  // namespace lite

--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -45,6 +45,11 @@ class CLContext {
  cl::NDRange DefaultWorkSize(const CLImage &image);

  cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
+  // Like LocalWorkSize, but divides max_work_size by `divisor` (default 2)
+  // instead of a fixed 2 so callers can try several candidate sizes.
+  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
+                                size_t max_work_size,
+                                int divisor = 2);

 private:
  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;

--- a/lite/backends/opencl/cl_utility.h
+++ b/lite/backends/opencl/cl_utility.h
@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
        __FILE__,                                                    \
        __LINE__);                                                   \
  }
-
+#ifndef LITE_SHUTDOWN_LOG
 #define CL_CHECK_FATAL(err_code__)                                   \
  if (err_code__ != CL_SUCCESS) {                                    \
    LOG(FATAL) << string_format(                                     \
@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
        __FILE__,                                                    \
        __LINE__);                                                   \
  }
+#else
+#define CL_CHECK_FATAL(err_code__)
+#endif
 }  // namespace lite
 }  // namespace paddle
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -388,18 +388,43 @@ void ConvImageCompute::PrepareForRun() {

  VLOG(4) << "max_work_group_size: " << max_work_group_size;

-  if (max_work_group_size > 0 && use_lws) {
-    // local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
-    //     global_work_size_, max_work_group_size);
-    local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_,
-                                                           max_work_group_size);
-
+  if (max_work_group_size > 0 && use_lws_) {
+    double min_turn_time = DBL_MAX;
+    cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
+        global_work_size_, max_work_group_size);
+    cl::NDRange last_local_work_size = cl::NDRange{
+        static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
+    if (use_turn_) {
+      for (size_t i = 1; i < 15; i++) {
+        if (kernel_h == 1 && kernel_w == 1) {
+          // TODO: use different logic for 1x1 kernels
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
+              global_work_size_, max_work_group_size, i);
+        } else {
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
+              global_work_size_, max_work_group_size, i);
+        }
+        if (last_local_work_size[0] == local_work_size_[0] &&
+            last_local_work_size[1] == local_work_size_[1] &&
+            last_local_work_size[2] == local_work_size_[2]) {
+          // skip an lws identical to the previous candidate
+          continue;
+        }
+        auto turn_time = this->Turn(5);
+        if (min_turn_time > turn_time) {
+          min_turn_time = turn_time;
+          best_local_work_size = local_work_size_;
+        }
+        last_local_work_size = local_work_size_;
+      }
+    }
+    local_work_size_ = best_local_work_size;
    VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
            << local_work_size_[1] << "," << local_work_size_[2] << "}";
  }
 }

-void ConvImageCompute::Conv2d1x1opt() {
+void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -431,23 +456,6 @@ void ConvImageCompute::Conv2d1x1opt() {
  int input_c = input_dims[1];
  auto dilations = *param.dilations;

-// const std::vector<size_t>& default_work_size =
-//     DefaultWorkSize(output_dims,
-//                     DDim(std::vector<DDim::value_type>{
-//                         static_cast<int64_t>(out_image_shape["width"]),
-//                         static_cast<int64_t>(out_image_shape["height"])}));
-
-// int c_block = default_work_size[0];
-// int w = default_work_size[1];
-// int nh = default_work_size[2];
-
-// int maped_w = maptofactor(w, 4);
-
-// auto global_work_size_ =
-//     cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
-//                 static_cast<size_t>(maped_w),
-//                 static_cast<size_t>(default_work_size.data()[2])};
-
 #ifndef LITE_SHUTDOWN_LOG
  //  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
@@ -541,73 +549,12 @@ void ConvImageCompute::Conv2d1x1opt() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
-
-#ifdef PROFILE_CONV_KERNEL
-  bool use_profile = false;
-  auto GetCurrentUS = []() -> double {
-    struct timeval time;
-    gettimeofday(&time, NULL);
-    return 1e+6 * time.tv_sec + time.tv_usec;
-  };
-  double start = GetCurrentUS();
-
-  if (use_profile) {
-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size_,
-        local_work_size_,
-        nullptr,
-        event_.get());
-    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_image, event_);
-  } else {
-    int count = 50;
-    double sumtime = 0;
-    if (!use_profile) {
-      count = 1;
-    }
-    for (size_t i = 0; i < count; i++) {
-      start = GetCurrentUS();
-      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-          kernel,
-          cl::NullRange,
-          global_work_size_,
-          local_work_size_,
-          nullptr,
-          event_.get());
-      CL_CHECK_FATAL(status);
-      context.cl_wait_list()->emplace(out_image, event_);
-      if (use_profile) {
-        event_->wait();
-        double duration = GetCurrentUS() - start;
-        sumtime += duration;
-      }
-    }
-
-    auto dims_string = [](DDimLite dims) -> std::string {
-      std::ostringstream stream;
-      stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
-             << dims[3] << "]";
-      return stream.str();
-    };
-    if (use_profile) {
-      // LOG(INFO) << "input: " << input_dims;
-      // LOG(INFO) << "filter: " << filter_dims;
-      // LOG(INFO) << "output: " << output_dims;
-
-      std::cout << std::setw(25) << std::left << dims_string(input_dims)
-                << std::setw(25) << std::left << dims_string(filter_dims)
-                << std::setw(25) << std::left << dims_string(output_dims)
-                << std::setw(25) << std::left << sumtime / count << std::endl;
-    } else {
-      dims_string(input_dims);
-    }
+  if (is_turn) {
+    event_->wait();
  }
-#endif
 }

-void ConvImageCompute::Conv2d3x3() {
+void ConvImageCompute::Conv2d3x3(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -767,9 +714,13 @@ void ConvImageCompute::Conv2d3x3() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::Conv2d3x3opt() {
+void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -890,9 +841,12 @@ void ConvImageCompute::Conv2d3x3opt() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::Conv2d5x5() {
+void ConvImageCompute::Conv2d5x5(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1018,9 +972,12 @@ void ConvImageCompute::Conv2d5x5() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::Conv2d5x5opt() {
+void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1134,9 +1091,12 @@ void ConvImageCompute::Conv2d5x5opt() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::Conv2d7x7() {
+void ConvImageCompute::Conv2d7x7(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1262,8 +1222,12 @@ void ConvImageCompute::Conv2d7x7() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }
-void ConvImageCompute::Conv2d7x7opt() {
+void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1374,8 +1338,12 @@ void ConvImageCompute::Conv2d7x7opt() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(out_image, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }
-void ConvImageCompute::DepthwiseConv2d3x3s1() {
+void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1454,9 +1422,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(output_img, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::DepthwiseConv2d3x3() {
+void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1548,9 +1520,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
      event_.get());
  CL_CHECK_FATAL(status);
  context.cl_wait_list()->emplace(output_img, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }

-void ConvImageCompute::DepthwiseConv2d() {
+void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  const auto& param = *param_.get_mutable<param_t>();
@@ -1683,8 +1659,22 @@ void ConvImageCompute::DepthwiseConv2d() {
  context.cl_wait_list()->emplace(out_image, event_);
 }

-void ConvImageCompute::Run() { (this->*impl_)(); }
-#undef PROFILE_CONV_KERNEL
+void ConvImageCompute::Run() { (this->*impl_)(false); }
+
+// Invokes impl_ `times` times with is_turn = true and returns the mean
+// wall-clock microseconds per call; returns DBL_MAX if times <= 0.
+double ConvImageCompute::Turn(int times) {
+  if (times <= 0) return DBL_MAX;  // no run, so never the fastest candidate
+  auto GetCurrentUS = []() -> double {
+    struct timeval time;
+    gettimeofday(&time, nullptr);
+    return 1e+6 * time.tv_sec + time.tv_usec;
+  };
+  const double start = GetCurrentUS();
+  for (int i = 0; i < times; ++i) (this->*impl_)(true);
+  return (GetCurrentUS() - start) / times;
+}
+
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite

--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -28,29 +28,29 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
-
 class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
                                           PRECISION(kFP16),
                                           DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConvParam;
-  using kernel_t = void (ConvImageCompute::*)();
+  using kernel_t = void (ConvImageCompute::*)(bool);

  void PrepareForRun() override;

  void Run() override;
+  double Turn(int times = 5);

 private:
-  void Conv2d1x1opt();
-  void Conv2d3x3();
-  void Conv2d3x3opt();
-  void Conv2d5x5();
-  void Conv2d5x5opt();
-  void Conv2d7x7();
-  void Conv2d7x7opt();
-  void DepthwiseConv2d3x3s1();
-  void DepthwiseConv2d3x3();
-  void DepthwiseConv2d();
+  void Conv2d1x1opt(bool is_turn = false);
+  void Conv2d3x3(bool is_turn = false);
+  void Conv2d3x3opt(bool is_turn = false);
+  void Conv2d5x5(bool is_turn = false);
+  void Conv2d5x5opt(bool is_turn = false);
+  void Conv2d7x7(bool is_turn = false);
+  void Conv2d7x7opt(bool is_turn = false);
+  void DepthwiseConv2d3x3s1(bool is_turn = false);
+  void DepthwiseConv2d3x3(bool is_turn = false);
+  void DepthwiseConv2d(bool is_turn = false);

  kernel_t impl_;
  std::vector<std::string> kernel_func_names_{};
@@ -72,7 +72,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
  cl::Kernel kernel_;
  cl::NDRange local_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
-  bool use_lws{true};
+  bool use_lws_{true};
+  bool use_turn_{false};
 };

 }  // namespace opencl