[LITE][OPENCL][PROFILE] Enhance precision profile & Clean opencl code (#3227)

* [LITE][OPENCL] clean code for opencl. test=develop
* [LITE][PROFILER] Enhance Precision Profiler. test=develop
* delete useless var in profiler. test=develop
* add ocl header. test=develop

[LITE][OPENCL][PROFILE] Enhance precision profile & Clean opencl code (#3227)
* [LITE][OPENCL] clean code for opencl. test=develop
* [LITE][PROFILER] Enhance Precision Profiler. test=develop
* delete useless var in profiler. test=develop
* add ocl header. test=develop
3868be2c · Yuan Shuai · GitHub · 3896590b · 3868be2c · 3868be2c
5 changed files
--- a/lite/backends/opencl/CMakeLists.txt
+++ b/lite/backends/opencl/CMakeLists.txt
@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
 lite_cc_library(cl_caller SRCS cl_caller.cc  DEPS cl_context cl_image)
 lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
 lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
-lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)

 add_dependencies(cl_wrapper opencl_clhpp)
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
@@ -22,6 +22,12 @@
 #include <vector>
 #include "lite/core/program.h"

+#ifdef LITE_WITH_OPENCL
+#include "lite/backends/opencl/cl_image_converter.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/kernels/opencl/image_helper.h"
+#endif
+
 namespace paddle {
 namespace lite {
 namespace profile {
@@ -45,59 +51,195 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {

 class PrecisionProfiler {
 public:
-  explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {}
-  ~PrecisionProfiler() {
-    LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr()
-              << " on Target " << TargetToStr(inst_->kernel()->target()) << " "
-              << PrecisionToStr(inst_->kernel()->precision());
-    auto tensor_mean = [](const Tensor* in,
-                          PrecisionType ptype,
-                          std::string name = "inst") -> double {
-      if (!in->data<int8_t>()) {
-        return -99999;
-      }
-      double sum = 0.;
-      switch (ptype) {
+  // TODO(ysh329): need to remove `explicit PrecisionProfiler`
+  // keep this method only for arm/math/conditional
+  explicit PrecisionProfiler(const Instruction* inst) {
+    std::string inst_precison_str = GetInstPrecision(inst);
+  }
+
+  PrecisionProfiler() {}
+
+  std::string GetSummaryHeader() {
+    using std::setw;
+    using std::left;
+    using std::fixed;
+    STL::stringstream ss;
+    ss << "========================================= "
+       << "Detailed Precision Profiler Summary "
+       << "=========================================" << std::endl;
+    ss << setw(45) << left << "operator:(kernel_info)"
+       << " " << setw(70) << left << "output_tensor_name:(tensor_info)"
+       << " " << setw(15) << left << "tensor_dims"
+       << " " << setw(15) << left << "tensor_mean"
+       << " " << setw(15) << left << "tensor_standard_deviation" << std::endl;
+
+    return ss.str();
+  }
+
+  template <typename T>
+  double compute_mean(const T* in, const size_t length) {
+    double sum = 0.;
+    for (size_t i = 0; i < length; ++i) {
+      sum += in[i];
+    }
+    return sum / length;
+  }
+
+  template <typename T>
+  double compute_standard_deviation(const T* in,
+                                    const size_t length,
+                                    bool has_mean = false,
+                                    double mean = 10000) {
+    if (!has_mean) {
+      mean = compute_mean<T>(in, length);
+    }
+
+    double variance = 0.;
+    for (size_t i = 0; i < length; ++i) {
+      variance += pow((in[i] - mean), 2);
+    }
+    variance /= length;
+    return sqrt(variance);
+  }
+
+  // check if output tensor unused
+  bool is_unused(const Tensor* in) {
+    if (!in->data<int8_t>()) {
+      return true;
+    }
+    return false;
+  }
+
+  void compute_tensor_precision_info(const Tensor* in,
+                                     TargetType target_type,
+                                     PrecisionType precision_type,
+                                     DataLayoutType layout_type,
+                                     double* mean,
+                                     double* std_dev,
+                                     std::string name = "inst") {
+    std::string unsupported_error_log =
+        "Unsupported precision profile for kernel registered on" +
+        TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
+        DataLayoutToStr(layout_type);
+
+    if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
+        target_type == TARGET(kX86)) {
+      switch (precision_type) {
        case PRECISION(kFloat): {
          auto ptr = in->data<float>();
          // write_tensorfile<float>(in, name);
-          for (int i = 0; i < in->numel(); ++i) {
-            sum += ptr[i];
-          }
-          return sum / in->numel();
+          *mean = compute_mean<float>(ptr, in->numel());
+          *std_dev =
+              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
+          return;
        }
        case PRECISION(kAny): {
          auto ptr = in->data<float>();
          // write_tensorfile<float>(in, name);
-          for (int i = 0; i < in->numel(); ++i) {
-            sum += ptr[i];
-          }
-          return sum / in->numel();
+          *mean = compute_mean<float>(ptr, in->numel());
+          *std_dev =
+              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
+          return;
        }
        case PRECISION(kInt8): {
          auto ptr = in->data<int8_t>();
          // write_tensorfile<int8_t>(in, name);
-          for (int i = 0; i < in->numel(); ++i) {
-            sum += ptr[i];
-          }
-          return sum / in->numel();
+          *mean = compute_mean<int8_t>(ptr, in->numel());
+          *std_dev =
+              compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
+          return;
        }
        case PRECISION(kInt32): {
          auto ptr = in->data<int32_t>();
          // write_tensorfile<int32_t>(in, name);
-          for (int i = 0; i < in->numel(); ++i) {
-            sum += ptr[i];
-          }
-          return sum / in->numel();
+          *mean = compute_mean<int32_t>(ptr, in->numel());
+          *std_dev = compute_standard_deviation<int32_t>(
+              ptr, in->numel(), true, *mean);
+          return;
+        }
+        default:
+          *mean = -333333333333;
+          *std_dev = -33333333333;
+          LOG(ERROR) << unsupported_error_log;
+          return;
+      }
+#ifdef LITE_WITH_OPENCL
+    } else if (target_type == TARGET(kOpenCL)) {
+      switch (layout_type) {
+        case DATALAYOUT(kImageDefault): {
+          paddle::lite::CLImageConverterDefault default_convertor;
+          auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
+          size_t im_w = image_shape[0];
+          size_t im_h = image_shape[1];
+          VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
+                  << im_h;
+          std::vector<uint16_t> in_data_v(im_w * im_h * 4);
+          std::vector<float> real_out_v(in->numel());
+          const size_t cl_image2d_row_pitch{0};
+          const size_t cl_image2d_slice_pitch{0};
+          TargetWrapperCL::ImgcpySync(in_data_v.data(),
+                                      in->data<uint16_t, cl::Image2D>(),
+                                      im_w,
+                                      im_h,
+                                      cl_image2d_row_pitch,
+                                      cl_image2d_slice_pitch,
+                                      IoDirection::DtoH);
+          default_convertor.ImageToNCHW(
+              in_data_v.data(), real_out_v.data(), image_shape, in->dims());
+          // write_tensorfile<float>(in, name);
+          CHECK(real_out_v.size() == in->numel());
+          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
+          *std_dev = compute_standard_deviation<float>(
+              real_out_v.data(), in->numel(), true, *mean);
+          return;
+        }
+        case DATALAYOUT(kNCHW): {
+          std::vector<float> in_data_v(in->numel(), 0);
+          TargetWrapperCL::MemcpySync(in_data_v.data(),
+                                      in->data<float>(),
+                                      in->numel() * sizeof(float),
+                                      IoDirection::DtoH);
+          VLOG(1) << name << ":" << in->numel();
+          *mean = compute_mean<float>(in_data_v.data(), in->numel());
+          *std_dev = compute_standard_deviation<float>(
+              in_data_v.data(), in->numel(), true, *mean);
+          return;
        }
        default:
-          LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype);
-          return 0.;
+          *mean = -222222222222;
+          *std_dev = -22222222222;
+          LOG(ERROR) << unsupported_error_log;
+          return;
      }
-    };
-    if (inst_->op()->op_info()->Type() != "fetch") {
-      auto op = const_cast<lite::OpLite*>(inst_->op());
-      auto kernel = inst_->kernel();
+#endif
+    } else {
+      *mean = -111111111111;
+      *std_dev = -11111111111;
+      LOG(ERROR) << unsupported_error_log;
+      return;
+    }
+  }
+
+  std::string GetInstPrecision(const Instruction* inst = nullptr) {
+    using std::setw;
+    using std::left;
+    using std::fixed;
+    STL::stringstream ss;
+
+    VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
+            << " registered on " << TargetToStr(inst->kernel()->target()) << "/"
+            << PrecisionToStr(inst->kernel()->precision()) << "/"
+            << DataLayoutToStr(inst->kernel()->layout());
+
+    std::string kernel_repr = inst->op()->op_info()->Repr();
+    std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
+                               PrecisionToStr(inst->kernel()->precision()) +
+                               "/" + DataLayoutToStr(inst->kernel()->layout());
+    std::string op_name = inst->op()->op_info()->Type();
+
+    if (inst->op()->op_info()->Type() != "fetch") {
+      auto op = const_cast<lite::OpLite*>(inst->op());
+      auto kernel = inst->kernel();
      auto op_scope = op->scope();
      auto out_names = op->op_info()->output_names();
      for (auto& out_name : out_names) {
@@ -106,32 +248,78 @@ class PrecisionProfiler {
        auto type = kernel->GetOutputDeclType(out_arg_name);

        if (type->IsTensor()) {
-          auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
-          double mean = tensor_mean(tout, type->precision(), out_name);
-          LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims()
-                    << ", precision: " << PrecisionToStr(type->precision())
-                    << ", mean value: " << mean << " shape:" << tout->dims();
+          const Tensor* tout =
+              op_scope->FindVar(out_name)->GetMutable<Tensor>();
+          double mean = -999999;
+          double std_dev = -100000;
+          std::string mean_str{"unused"};
+          std::string std_dev_str{"unused"};
+
+          if (!is_unused(tout)) {
+            compute_tensor_precision_info(tout,
+                                          type->target(),
+                                          type->precision(),
+                                          type->layout(),
+                                          &mean,
+                                          &std_dev,
+                                          out_name);
+            mean_str = std::to_string(mean);
+            std_dev_str = std::to_string(std_dev);
+          }
+          std::string kernel_info = op_name + ":" + kernel_place;
+          std::string output_arg_info = out_name + ":" +
+                                        TargetToStr(type->target()) + "/" +
+                                        PrecisionToStr(type->precision()) +
+                                        "/" + DataLayoutToStr(type->layout());
+
+          ss << setw(45) << left << kernel_info << " " << setw(70) << left
+             << output_arg_info << " " << setw(15) << left << tout->dims()
+             << " " << setw(15) << left << mean_str << " " << setw(15) << left
+             << std_dev_str << std::endl;
        } else if (type->IsTensorList()) {
-          auto tout =
+          auto touts =
              op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
-          for (auto& t : *tout) {
-            double mean = tensor_mean(&t, type->precision(), out_name);
-            LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims()
-                      << ", precision: " << PrecisionToStr(type->precision())
-                      << ", mean value: " << mean;
+          for (auto t : *touts) {
+            const Tensor* tout = &t;
+            double mean = -999999;
+            double std_dev = -100000;
+            std::string mean_str{"unused"};
+            std::string std_dev_str{"unused"};
+
+            if (!is_unused(tout)) {
+              compute_tensor_precision_info(tout,
+                                            type->target(),
+                                            type->precision(),
+                                            type->layout(),
+                                            &mean,
+                                            &std_dev,
+                                            out_name);
+              mean_str = std::to_string(mean);
+              std_dev_str = std::to_string(std_dev);
+            }
+            std::string kernel_info = op_name + ":" + kernel_place;
+            std::string output_arg_info = out_name + ":" +
+                                          TargetToStr(type->target()) + "/" +
+                                          PrecisionToStr(type->precision()) +
+                                          "/" + DataLayoutToStr(type->layout());
+
+            ss << setw(45) << left << kernel_info << " " << setw(70) << left
+               << output_arg_info << " " << setw(15) << left << tout->dims()
+               << " " << setw(15) << left << mean_str << " " << setw(15) << left
+               << std_dev_str << std::endl;
          }
        }
      }
    }
+    return ss.str();
  }
-
- private:
-  const Instruction* inst_{nullptr};
 };

 }  // namespace profile
 }  // namespace lite
 }  // namespace paddle

+// TODO(ysh329): need to remove.
+// keep this method only for arm/math/conditional_block_compute
 #define LITE_PRECISION_PROFILE(inst) \
  { auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
 }

 void RuntimeProgram::Run() {
+#ifdef LITE_WITH_PROFILE
+#ifdef LITE_WITH_PRECISION_PROFILE
+  auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
+  std::string precision_profiler_summary =
+      inst_precision_profiler.GetSummaryHeader();
+#endif
+#endif
+
  for (auto& inst : instructions_) {
 #ifndef LITE_WITH_FPGA
    if (inst.is_feed_fetch_op()) continue;
@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
 #ifdef LITE_WITH_PROFILE
 #ifdef LITE_WITH_PRECISION_PROFILE
 #ifndef LITE_WITH_FPGA
-    LITE_PRECISION_PROFILE(inst)
+    precision_profiler_summary +=
+        inst_precision_profiler.GetInstPrecision(&inst);
 #endif
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
  }
 #ifdef LITE_WITH_PROFILE
  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
+#ifdef LITE_WITH_PRECISION_PROFILE
+  LOG(INFO) << "\n" << precision_profiler_summary;
+#endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
 }


--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
 #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
 #             DEPS conv_opencl op_registry program context)

+#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
+#             DEPS tensor cl_context cl_wrapper cl_target_wrapper)
+
 #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
 #             DEPS depthwise_conv2d_opencl op_registry program context)


--- a/lite/backends/opencl/cl_im2col_test.cc
+++ b/lite/backends/opencl/cl_im2col_test.cc