[LITE][OPENCL] Fix reshape and enhance elemul for OpenCL; enhance precision profiler, etc. (#3268)

* [LITE][OPENCL] enhance elemul, fix reshape for opencl. test=develop * comment out loop test for pad2d opencl kernel. test=develop * [LITE][PROFILE] add sequence statistics for precision profiler. test=develop

[LITE][OPENCL] Fix reshape and enhance elemul for OpenCL; enhance precision profiler, etc. (#3268)
* [LITE][OPENCL] enhance elemul, fix reshape for opencl. test=develop * comment out loop test for pad2d opencl kernel. test=develop * [LITE][PROFILE] add sequence statistics for precision profiler. test=develop
13cdbc5e · Yuan Shuai · GitHub · d8438e83 · 13cdbc5e · 13cdbc5e
4 changed files
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
@@ -33,13 +33,14 @@ namespace lite {
 namespace profile {

 template <typename dtype>
-static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
+static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
  if (locate.find('/') != std::string::npos) {
-    return;
+    return false;
  }
  FILE* fp = fopen(locate.c_str(), "w");
  if (fp == nullptr) {
    LOG(ERROR) << "file open field " << locate;
+    return false;
  } else {
    const dtype* data = tensor->data<dtype>();
    for (int i = 0; i < tensor->numel(); ++i) {
@@ -47,6 +48,7 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
    }
  }
  fclose(fp);
+  return true;
 }

 class PrecisionProfiler {
@@ -69,9 +71,10 @@ class PrecisionProfiler {
       << "=========================================" << std::endl;
    ss << setw(45) << left << "operator:(kernel_info)"
       << " " << setw(70) << left << "output_tensor_name:(tensor_info)"
-       << " " << setw(15) << left << "tensor_dims"
-       << " " << setw(15) << left << "tensor_mean"
-       << " " << setw(15) << left << "tensor_standard_deviation" << std::endl;
+       << " " << setw(15) << left << "dims"
+       << " " << setw(15) << left << "mean"
+       << " " << setw(15) << left << "std_deviation"
+       << " " << setw(15) << left << "ave_grow_rate*" << std::endl;

    return ss.str();
  }
@@ -102,6 +105,17 @@ class PrecisionProfiler {
    return sqrt(variance);
  }

+  template <typename T>
+  double compute_average_grow_rate(const T* in, const size_t length) {
+    const double eps = 1e-5;
+    double ave_grow_rate = 0.0f;
+    for (size_t i = 1; i < length; ++i) {
+      ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
+    }
+    ave_grow_rate /= length;
+    return ave_grow_rate;
+  }
+
  // check if output tensor unused
  bool is_unused(const Tensor* in) {
    if (!in->data<int8_t>()) {
@@ -116,7 +130,9 @@ class PrecisionProfiler {
                                     DataLayoutType layout_type,
                                     double* mean,
                                     double* std_dev,
-                                     std::string name = "inst") {
+                                     double* ave_grow_rate,
+                                     std::string name = "inst",
+                                     bool write_result_to_file = false) {
    std::string unsupported_error_log =
        "Unsupported precision profile for kernel registered on" +
        TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
@@ -127,39 +143,44 @@ class PrecisionProfiler {
      switch (precision_type) {
        case PRECISION(kFloat): {
          auto ptr = in->data<float>();
-          // write_tensorfile<float>(in, name);
          *mean = compute_mean<float>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
+          write_result_to_file&& write_tensorfile<float>(in, name);
          return;
        }
        case PRECISION(kAny): {
          auto ptr = in->data<float>();
-          // write_tensorfile<float>(in, name);
          *mean = compute_mean<float>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
+          write_result_to_file&& write_tensorfile<float>(in, name);
          return;
        }
        case PRECISION(kInt8): {
          auto ptr = in->data<int8_t>();
-          // write_tensorfile<int8_t>(in, name);
          *mean = compute_mean<int8_t>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
+          write_result_to_file&& write_tensorfile<int8_t>(in, name);
          return;
        }
        case PRECISION(kInt32): {
          auto ptr = in->data<int32_t>();
-          // write_tensorfile<int32_t>(in, name);
          *mean = compute_mean<int32_t>(ptr, in->numel());
          *std_dev = compute_standard_deviation<int32_t>(
              ptr, in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
+          write_result_to_file&& write_tensorfile<int32_t>(in, name);
          return;
        }
        default:
          *mean = -333333333333;
          *std_dev = -33333333333;
+          *ave_grow_rate = -33333333333;
          LOG(ERROR) << unsupported_error_log;
          return;
      }
@@ -186,11 +207,13 @@ class PrecisionProfiler {
                                      IoDirection::DtoH);
          default_convertor.ImageToNCHW(
              in_data_v.data(), real_out_v.data(), image_shape, in->dims());
-          // write_tensorfile<float>(in, name);
          CHECK(real_out_v.size() == in->numel());
          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
          *std_dev = compute_standard_deviation<float>(
              real_out_v.data(), in->numel(), true, *mean);
+          *ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
+                                                            real_out_v.size());
+          write_result_to_file&& write_tensorfile<float>(in, name);
          return;
        }
        case DATALAYOUT(kNCHW): {
@@ -203,11 +226,15 @@ class PrecisionProfiler {
          *mean = compute_mean<float>(in_data_v.data(), in->numel());
          *std_dev = compute_standard_deviation<float>(
              in_data_v.data(), in->numel(), true, *mean);
+          *ave_grow_rate =
+              compute_average_grow_rate<float>(in_data_v.data(), in->numel());
+          write_result_to_file&& write_tensorfile<float>(in, name);
          return;
        }
        default:
          *mean = -222222222222;
          *std_dev = -22222222222;
+          *ave_grow_rate = -22222222222;
          LOG(ERROR) << unsupported_error_log;
          return;
      }
@@ -215,6 +242,7 @@ class PrecisionProfiler {
    } else {
      *mean = -111111111111;
      *std_dev = -11111111111;
+      *ave_grow_rate = -11111111111;
      LOG(ERROR) << unsupported_error_log;
      return;
    }
@@ -225,6 +253,7 @@ class PrecisionProfiler {
    using std::left;
    using std::fixed;
    STL::stringstream ss;
+    bool write_result_to_file = false;

    VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
            << " registered on " << TargetToStr(inst->kernel()->target()) << "/"
@@ -252,8 +281,10 @@ class PrecisionProfiler {
              op_scope->FindVar(out_name)->GetMutable<Tensor>();
          double mean = -999999;
          double std_dev = -100000;
+          double ave_grow_rate = 99999;
          std::string mean_str{"unused"};
          std::string std_dev_str{"unused"};
+          std::string ave_grow_rate_str{"unused"};

          if (!is_unused(tout)) {
            compute_tensor_precision_info(tout,
@@ -262,9 +293,12 @@ class PrecisionProfiler {
                                          type->layout(),
                                          &mean,
                                          &std_dev,
-                                          out_name);
-            mean_str = paddle::lite::to_string(mean);
-            std_dev_str = paddle::lite::to_string(std_dev);
+                                          &ave_grow_rate,
+                                          out_name,
+                                          write_result_to_file);
+            mean_str = std::to_string(mean);
+            std_dev_str = std::to_string(std_dev);
+            ave_grow_rate_str = std::to_string(ave_grow_rate);
          }
          std::string kernel_info = op_name + ":" + kernel_place;
          std::string output_arg_info = out_name + ":" +
@@ -275,7 +309,8 @@ class PrecisionProfiler {
          ss << setw(45) << left << kernel_info << " " << setw(70) << left
             << output_arg_info << " " << setw(15) << left << tout->dims()
             << " " << setw(15) << left << mean_str << " " << setw(15) << left
-             << std_dev_str << std::endl;
+             << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
+             << std::endl;
        } else if (type->IsTensorList()) {
          auto touts =
              op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
@@ -283,8 +318,10 @@ class PrecisionProfiler {
            const Tensor* tout = &t;
            double mean = -999999;
            double std_dev = -100000;
+            double ave_grow_rate = 99999;
            std::string mean_str{"unused"};
            std::string std_dev_str{"unused"};
+            std::string ave_grow_rate_str{"unused"};

            if (!is_unused(tout)) {
              compute_tensor_precision_info(tout,
@@ -293,9 +330,12 @@ class PrecisionProfiler {
                                            type->layout(),
                                            &mean,
                                            &std_dev,
-                                            out_name);
-              mean_str = paddle::lite::to_string(mean);
-              std_dev_str = paddle::lite::to_string(std_dev);
+                                            &ave_grow_rate,
+                                            out_name,
+                                            write_result_to_file);
+              mean_str = std::to_string(mean);
+              std_dev_str = std::to_string(std_dev);
+              ave_grow_rate_str = std::to_string(ave_grow_rate);
            }
            std::string kernel_info = op_name + ":" + kernel_place;
            std::string output_arg_info = out_name + ":" +
@@ -306,7 +346,8 @@ class PrecisionProfiler {
            ss << setw(45) << left << kernel_info << " " << setw(70) << left
               << output_arg_info << " " << setw(15) << left << tout->dims()
               << " " << setw(15) << left << mean_str << " " << setw(15) << left
-               << std_dev_str << std::endl;
+               << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
+               << std::endl;
          }
        }
      }

--- a/lite/kernels/opencl/elementwise_mul_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc
@@ -56,7 +56,7 @@ class ElementwiseMulImageCompute
      } else {
        kernel_func_name_ = "channel_mul_d2_hw";
      }
-    } else if (y_dims.size() == 4) {
+    } else if (y_dims.size() == 4 || x_dims.size() == 4) {
      kernel_func_name_ = "channel_mul_d4";
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
@@ -172,6 +172,18 @@ class ElementwiseMulImageCompute
        status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
        CL_CHECK_FATAL(status);
      }
+    } else if (x_dims.size() == 4) {
+      auto tensor_w = y_dims[y_dims.size() - 1];
+      VLOG(4) << "tensor_w:" << tensor_w;
+      // kernel: channel_mul_d4
+      cl_int status = kernel.setArg(arg_idx, *y_img);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *x_img);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *out_img);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
+      CL_CHECK_FATAL(status);
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
                 << y_dims.size();

--- a/lite/kernels/opencl/pad2d_image_compute_test.cc
+++ b/lite/kernels/opencl/pad2d_image_compute_test.cc
@@ -89,7 +89,7 @@ void pad2d_ref(const float *x_data,
  }
 }

-#define LOOP_TEST
+// #define LOOP_TEST
 // #define PRINT_RESULT
 TEST(pad2d_image2d, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "

--- a/lite/kernels/opencl/reshape_image_compute.cc
+++ b/lite/kernels/opencl/reshape_image_compute.cc
@@ -203,8 +203,8 @@ REGISTER_LITE_KERNEL(reshape2,
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
@@ -221,7 +221,7 @@ REGISTER_LITE_KERNEL(flatten,
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
-    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
@@ -239,7 +239,7 @@ REGISTER_LITE_KERNEL(flatten2,
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),