// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
 * This file implements PrecisionProfiler, which helps to profile the
 * precision of each kernel's output tensors during execution. For every
 * output tensor it can report the dims, mean, standard deviation, and
 * average growth rate, and optionally dump the tensor values to files.
 */
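// A minimal usage sketch (hypothetical driver code; assumes the library is
// built with precision profiling enabled, e.g. the LITE_WITH_PRECISION_PROFILE
// build flag):
//
//   paddle::lite::profile::PrecisionProfiler profiler;
//   LOG(INFO) << profiler.GetSummaryHeader();
//   // ... after executing each Instruction `inst`:
//   LOG(INFO) << profiler.GetInstPrecision(&inst);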
#pragma once

#include <sys/time.h>
#include <time.h>

#include <cmath>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/utils/io.h"
#ifdef LITE_WITH_X86
#include "lite/fluid/float16.h"
#endif

#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif

#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/math/type_trans.h"
#endif

namespace paddle {
namespace lite {
namespace profile {

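// Build a timestamp string such as "2019101_123045_678" (year, month, and
// day, then hour, minute, and second, then milliseconds). Fields are
// concatenated without zero padding; the result names the default log
// directory below.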
static const std::string get_date_str() {
  struct tm tm_time;
  time_t timestamp = time(NULL);
  localtime_r(&timestamp, &tm_time);
  struct timeval tv;
  gettimeofday(&tv, NULL);

  // print date / time
  std::string date_str =
      std::to_string(1900 + tm_time.tm_year) +
      std::to_string(1 + tm_time.tm_mon) + std::to_string(tm_time.tm_mday) +
      '_' + std::to_string(tm_time.tm_hour) + std::to_string(tm_time.tm_min) +
      std::to_string(tm_time.tm_sec) + '_' + std::to_string(tv.tv_usec / 1000);
  return date_str;
}

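// Replace every '/' in a tensor name with '_' so the result can be used as a
// file name.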
inline std::string generate_valid_tensor_name(const std::string& name) {
  std::string new_name("");
  for (size_t i = 0; i < name.length(); ++i) {
    if (name[i] != '/') {
      new_name += name[i];
    } else {
      new_name += "_";
    }
  }
  return new_name;
}

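// Dump the elements of `tensor` to `<prefix_path><tensor_name>.txt`, one
// "[index] value" pair per line. The data pointer must be host-accessible;
// returns false if the file cannot be opened.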
template <typename dtype>
static bool write_tensorfile(
    const Tensor* tensor,
    const std::string& tensor_name,
    const std::string prefix_path = "/storage/emulated/0/") {
  std::string new_tensor_name = generate_valid_tensor_name(tensor_name);
  if (tensor_name.find('/') != std::string::npos) {
    LOG(ERROR) << "--> tensor name is abnormal with '\\':" << tensor_name
               << " !!!, replace with '_'," << new_tensor_name
               << new_tensor_name;
T
TianXiaogang 已提交
87
  }

  std::string tensor_save_path = prefix_path + new_tensor_name + ".txt";
  FILE* fp = fopen(tensor_save_path.c_str(), "w");
  if (fp == nullptr) {
    LOG(ERROR) << "failed to open file " << tensor_save_path;
    return false;
  } else {
    const dtype* data = tensor->data<dtype>();
    for (int i = 0; i < tensor->numel(); ++i) {
      fprintf(fp, "[%d] %f \n", i, static_cast<float>(data[i]));
    }
  }
  fclose(fp);
  LOG(INFO) << "write tensor " << tensor_name
            << " to file:" << tensor_save_path;
  return true;
}

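// Append one record to the precision summary log file; returns false if
// `summary_log_dir` is empty or the file cannot be opened.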
static bool write_precision_summary_tofile(
    const std::string& msg, const std::string& summary_log_dir = "") {
  if (summary_log_dir == "") {
    LOG(INFO) << "The `summary_log_dir` of the precision summary file is not "
                 "set, skip writing.";
    return false;
  }

  FILE* fp = fopen(summary_log_dir.c_str(), "a");
  if (fp == nullptr) {
    LOG(INFO) << "Open precision summary file:" << summary_log_dir << "failed.";
    return false;
  } else {
    fprintf(fp, "%s\n", string.c_str());
  }
  fclose(fp);
  return true;
}

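// Profiles the precision of each kernel's output tensors after execution:
// for every output it records dims, mean, standard deviation, and average
// growth rate, and can dump the raw tensor values under `log_dir_`.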
class PrecisionProfiler {
 public:
  // TODO(ysh329): need to remove `explicit PrecisionProfiler`
  // keep this method only for arm/math/conditional
  explicit PrecisionProfiler(const Instruction* inst) {
    std::string inst_precision_str = GetInstPrecision(inst);
  }

  PrecisionProfiler() { MkDirRecur(log_dir_); }

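  // Build the column header of the precision summary table; if
  // `summary_log_dir_` is set, the header is also appended to that file.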
  std::string GetSummaryHeader() {
    using std::setw;
    using std::left;
    using std::fixed;
    STL::stringstream ss;
    ss << "\n\n========================================= "
       << "Detailed Precision Profiler Summary "
       << "=========================================" << std::endl;
    ss << setw(45) << left << "operator:(kernel_info)"
       << " " << setw(70) << left << "output_tensor_name:(tensor_info)"
       << " " << setw(15) << left << "dims"
       << " " << setw(15) << left << "mean"
       << " " << setw(15) << left << "std_deviation"
       << " " << setw(15) << left << "ave_grow_rate*" << std::endl;

    // write to file with path: `summary_log_dir_`
    if (summary_log_dir_ != "") {
      FILE* fp = fopen(summary_log_dir_.c_str(), "a");
      if (fp != nullptr) {
        std::string header_str{ss.str()};
        fprintf(fp, "%s\n", header_str.c_str());
        fclose(fp);
      }
    }
    return ss.str();
  }

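  // Arithmetic mean over `length` elements: sum(in[i]) / length.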
  template <typename T>
  double compute_mean(const T* in, const size_t length) {
    double sum = 0.;
    for (size_t i = 0; i < length; ++i) {
      sum += in[i];
    }
    return sum / length;
  }

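  // Population standard deviation: sqrt(sum((in[i] - mean)^2) / length).
  // Pass `has_mean = true` with a precomputed `mean` to skip the extra pass
  // over the data.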
  template <typename T>
  double compute_standard_deviation(const T* in,
                                    const size_t length,
                                    bool has_mean = false,
                                    double mean = 10000) {
    if (!has_mean) {
      mean = compute_mean<T>(in, length);
    }

    double variance = 0.;
    for (size_t i = 0; i < length; ++i) {
      variance += pow((in[i] - mean), 2);
    }
    variance /= length;
    return sqrt(variance);
  }

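  // Average element-to-element growth rate:
  //   (1 / length) * sum_{i=1}^{length-1} (in[i] - in[i-1]) / (in[i-1] + eps).
  // Unlike mean and std_dev, this statistic is sensitive to element order,
  // which makes it useful for telling apart tensors with similar
  // distributions but different layouts.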
  template <typename T>
  double compute_average_grow_rate(const T* in, const size_t length) {
    const double eps = 1e-5;
    double ave_grow_rate = 0.0f;
    for (size_t i = 1; i < length; ++i) {
      ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
    }
    ave_grow_rate /= length;
    return ave_grow_rate;
  }

  // Check whether an output tensor is unused (no data buffer allocated).
  bool is_unused(const Tensor* in) {
    if (!in->data<int8_t>()) {
      return true;
    }
    return false;
  }

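  // Compute mean / std_dev / ave_grow_rate for tensor `in`, dispatching on
  // the kernel's target, precision, and layout; OpenCL and CUDA tensors are
  // copied back to host first. Distinctive sentinels (-1111..., -2222...,
  // -3333...) mark unsupported target/precision/layout combinations.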
  void compute_tensor_precision_info(const Tensor* in,
                                     TargetType target_type,
                                     PrecisionType precision_type,
                                     DataLayoutType layout_type,
                                     double* mean,
                                     double* std_dev,
                                     double* ave_grow_rate,
                                     std::string name = "inst",
                                     bool write_result_to_file = false) {
    std::string unsupported_error_log =
        "Unsupported precision profile for kernel registered on" +
        TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
        DataLayoutToStr(layout_type);

    if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
        target_type == TARGET(kX86)) {
      switch (precision_type) {
        case PRECISION(kFloat): {
          auto ptr = in->data<float>();
          *mean = compute_mean<float>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        case PRECISION(kAny): {
          auto ptr = in->data<float>();
          *mean = compute_mean<float>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        case PRECISION(kInt8): {
          auto ptr = in->data<int8_t>();
          *mean = compute_mean<int8_t>(ptr, in->numel());
          *std_dev =
              compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
          write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
          return;
        }
        case PRECISION(kInt32): {
          auto ptr = in->data<int32_t>();
          *mean = compute_mean<int32_t>(ptr, in->numel());
          *std_dev = compute_standard_deviation<int32_t>(
              ptr, in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
          write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
          return;
        }
        case PRECISION(kInt64): {
          auto ptr = in->data<int64_t>();
          *mean = compute_mean<int64_t>(ptr, in->numel());
          *std_dev = compute_standard_deviation<int64_t>(
              ptr, in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<int64_t>(ptr, in->numel());
          write_result_to_file&& write_tensorfile<int64_t>(in, name, log_dir_);
          return;
        }
        default:
          *mean = -333333333333;
          *std_dev = -33333333333;
          *ave_grow_rate = -33333333333;
          LOG(ERROR) << unsupported_error_log;
          return;
      }
#ifdef LITE_WITH_OPENCL
    } else if (target_type == TARGET(kOpenCL)) {
      CLRuntime::Global()->command_queue().finish();
      switch (layout_type) {
        case DATALAYOUT(kImageDefault): {
          paddle::lite::CLImageConverterDefault default_convertor;
          auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
          size_t im_w = image_shape[0];
          size_t im_h = image_shape[1];
          VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
                  << im_h;
          std::vector<uint16_t> in_data_v(im_w * im_h * 4);
          std::vector<float> real_out_v(in->numel());
          const size_t cl_image2d_row_pitch{0};
          const size_t cl_image2d_slice_pitch{0};
          TargetWrapperCL::ImgcpySync(in_data_v.data(),
                                      in->data<uint16_t, cl::Image2D>(),
                                      im_w,
                                      im_h,
                                      cl_image2d_row_pitch,
                                      cl_image2d_slice_pitch,
                                      IoDirection::DtoH);
          default_convertor.ImageToNCHW(
              in_data_v.data(), real_out_v.data(), image_shape, in->dims());
          CHECK(real_out_v.size() == in->numel());
          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
          *std_dev = compute_standard_deviation<float>(
              real_out_v.data(), in->numel(), true, *mean);
          *ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
                                                            real_out_v.size());
          std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
          real_out_t->Resize(in->dims());
          float* real_out_data = real_out_t->mutable_data<float>();
          memcpy(real_out_data,
                 real_out_v.data(),
                 real_out_v.size() * sizeof(float));
          write_result_to_file&& write_tensorfile<float>(
              real_out_t.get(), name, log_dir_);
          return;
        }
        case DATALAYOUT(kNCHW): {
          std::vector<float> in_data_v(in->numel(), 0);
          TargetWrapperCL::MemcpySync(in_data_v.data(),
                                      in->data<float>(),
                                      in->numel() * sizeof(float),
                                      IoDirection::DtoH);
          VLOG(1) << name << ":" << in->numel();
          *mean = compute_mean<float>(in_data_v.data(), in->numel());
          *std_dev = compute_standard_deviation<float>(
              in_data_v.data(), in->numel(), true, *mean);
          *ave_grow_rate =
              compute_average_grow_rate<float>(in_data_v.data(), in->numel());
          std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
          real_out_t->Resize(in->dims());
          float* real_out_data = real_out_t->mutable_data<float>();
          memcpy(real_out_data,
                 in_data_v.data(),
                 in_data_v.size() * sizeof(float));
          write_result_to_file&& write_tensorfile<float>(
              real_out_t.get(), name, log_dir_);
          return;
        }
        default:
          *mean = -222222222222;
          *std_dev = -22222222222;
          *ave_grow_rate = -22222222222;
          LOG(ERROR) << unsupported_error_log;
          return;
      }
#endif
#ifdef LITE_WITH_CUDA
    } else if (target_type == TARGET(kCUDA)) {
      switch (precision_type) {
        case PRECISION(kAny):
        case PRECISION(kFloat): {
          std::vector<float> in_data_v(in->numel(), 0);
          TargetWrapperCuda::MemcpySync(in_data_v.data(),
                                        in->data<float>(),
                                        in->numel() * sizeof(float),
                                        IoDirection::DtoH);
          VLOG(1) << name << ":" << in->numel();
          *mean = compute_mean<float>(in_data_v.data(), in->numel());
          *std_dev = compute_standard_deviation<float>(
              in_data_v.data(), in->numel(), true, *mean);
          *ave_grow_rate =
              compute_average_grow_rate<float>(in_data_v.data(), in->numel());
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        case PRECISION(kInt32): {
          std::vector<int> in_data_v(in->numel(), 0);
          TargetWrapperCuda::MemcpySync(in_data_v.data(),
                                        in->data<int>(),
                                        in->numel() * sizeof(int),
                                        IoDirection::DtoH);
          VLOG(1) << name << ":" << in->numel();
          *mean = compute_mean<int>(in_data_v.data(), in->numel());
          *std_dev = compute_standard_deviation<int>(
              in_data_v.data(), in->numel(), true, *mean);
          *ave_grow_rate =
              compute_average_grow_rate<int>(in_data_v.data(), in->numel());
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        case PRECISION(kInt64): {
          std::vector<int64_t> in_data_v(in->numel(), 0);
          TargetWrapperCuda::MemcpySync(in_data_v.data(),
                                        in->data<int64_t>(),
                                        in->numel() * sizeof(int64_t),
                                        IoDirection::DtoH);
          VLOG(1) << name << ":" << in->numel();
          *mean = compute_mean<int64_t>(in_data_v.data(), in->numel());
          *std_dev = compute_standard_deviation<int64_t>(
              in_data_v.data(), in->numel(), true, *mean);
          *ave_grow_rate =
              compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        case PRECISION(kFP16): {
          std::vector<float> in_data_v(in->numel(), 0);
          lite::Tensor fp32_tensor;
          fp32_tensor.Resize(in->dims());
          lite::cuda::math::fp16_to_fp32(
              in->numel(),
              in->data<half>(),
              fp32_tensor.mutable_data<float>(TARGET(kCUDA)));
          TargetWrapperCuda::MemcpySync(in_data_v.data(),
                                        fp32_tensor.data<float>(),
                                        in->numel() * sizeof(float),
                                        IoDirection::DtoH);
          VLOG(1) << name << ":" << in->numel();
          *mean = compute_mean<float>(in_data_v.data(), in->numel());
406 407
          *std_dev = compute_standard_deviation<float>(
              in_data_v.data(), in->numel(), true, *mean);
408 409
          *ave_grow_rate =
              compute_average_grow_rate<float>(in_data_v.data(), in->numel());
410
          write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
          return;
        }
        default:
          *mean = -222222222222;
          *std_dev = -22222222222;
          *ave_grow_rate = -22222222222;
          LOG(ERROR) << unsupported_error_log;
          return;
      }
#endif
    } else {
      *mean = -111111111111;
      *std_dev = -11111111111;
      *ave_grow_rate = -11111111111;
      LOG(ERROR) << unsupported_error_log;
      return;
    }
  }

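  // Format one summary row per output tensor of `inst` (which must not be
  // null despite the default argument), append the rows to
  // `summary_log_dir_`, and return them as a string. "fetch" ops are
  // skipped.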
  std::string GetInstPrecision(const Instruction* inst = nullptr) {
    using std::setw;
    using std::left;
    using std::fixed;
    STL::stringstream ss;
    bool write_result_to_file = true;

    VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
            << " registered on " << TargetToStr(inst->kernel()->target()) << "/"
            << PrecisionToStr(inst->kernel()->precision()) << "/"
            << DataLayoutToStr(inst->kernel()->layout())
            << ", write_result_to_file:" << write_result_to_file;

    std::string kernel_repr = inst->op()->op_info()->Repr();
    std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
                               PrecisionToStr(inst->kernel()->precision()) +
                               "/" + DataLayoutToStr(inst->kernel()->layout());
    std::string op_name = inst->op()->op_info()->Type();

    if (inst->op()->op_info()->Type() != "fetch") {
      auto op = const_cast<lite::OpLite*>(inst->op());
      auto kernel = inst->kernel();
      auto op_scope = op->scope();
      auto out_names = op->op_info()->output_names();
      for (auto& out_name : out_names) {
        std::string out_arg_name;
        op->op_info()->GetOutputArgname(out_name, &out_arg_name);
        auto type = kernel->GetOutputDeclType(out_arg_name);

        if (type->IsTensor()) {
          const Tensor* tout =
              op_scope->FindVar(out_name)->GetMutable<Tensor>();
          double mean = -999999;
          double std_dev = -100000;
          double ave_grow_rate = 99999;
          std::string mean_str{"unused"};
          std::string std_dev_str{"unused"};
          std::string ave_grow_rate_str{"unused"};

          if (!is_unused(tout)) {
            compute_tensor_precision_info(tout,
                                          type->target(),
                                          type->precision(),
                                          type->layout(),
                                          &mean,
                                          &std_dev,
                                          &ave_grow_rate,
                                          out_name,
                                          write_result_to_file);
            mean_str = std::to_string(mean);
            std_dev_str = std::to_string(std_dev);
            ave_grow_rate_str = std::to_string(ave_grow_rate);
          }
          std::string kernel_info = op_name + ":" + kernel_place;
          std::string output_arg_info = out_name + ":" +
                                        TargetToStr(type->target()) + "/" +
                                        PrecisionToStr(type->precision()) +
                                        "/" + DataLayoutToStr(type->layout());

          ss << setw(45) << left << kernel_info << " " << setw(70) << left
             << output_arg_info << " " << setw(15) << left << tout->dims()
             << " " << setw(15) << left << mean_str << " " << setw(15) << left
             << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
             << std::endl;
        } else if (type->IsTensorList()) {
          auto touts =
              op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
          for (auto t : *touts) {
            const Tensor* tout = &t;
            double mean = -999999;
            double std_dev = -100000;
            double ave_grow_rate = 99999;
            std::string mean_str{"unused"};
            std::string std_dev_str{"unused"};
            std::string ave_grow_rate_str{"unused"};

            if (!is_unused(tout)) {
              compute_tensor_precision_info(tout,
                                            type->target(),
                                            type->precision(),
                                            type->layout(),
                                            &mean,
                                            &std_dev,
                                            &ave_grow_rate,
                                            out_name,
                                            write_result_to_file);
              mean_str = std::to_string(mean);
              std_dev_str = std::to_string(std_dev);
              ave_grow_rate_str = std::to_string(ave_grow_rate);
            }
            std::string kernel_info = op_name + ":" + kernel_place;
            std::string output_arg_info = out_name + ":" +
                                          TargetToStr(type->target()) + "/" +
                                          PrecisionToStr(type->precision()) +
                                          "/" + DataLayoutToStr(type->layout());

            ss << setw(45) << left << kernel_info << " " << setw(70) << left
               << output_arg_info << " " << setw(15) << left << tout->dims()
               << " " << setw(15) << left << mean_str << " " << setw(15) << left
               << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
               << std::endl;
          }
        }
      }
    }
    write_precision_summary_tofile(ss.str(), summary_log_dir_);
    return ss.str();
  }

 private:
  std::string log_dir_{"/storage/emulated/0/PaddleLite_" + get_date_str() +
                       "/"};
  std::string summary_log_dir_{log_dir_ + "precision_summary.log"};
};

}  // namespace profile
}  // namespace lite
}  // namespace paddle

// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \
  { auto a = paddle::lite::profile::PrecisionProfiler(&inst); }