// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cmath>
#include <condition_variable>  // NOLINT
#include <cstdint>
#include <iomanip>
#include <memory>
#include <mutex>  // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <thread>  // NOLINT
#include <unordered_map>
#include <vector>
#ifdef WITH_GPERFTOOLS
#include <gperftools/profiler.h>
#endif

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/utils/benchmark.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_string(model_name, "", "model name");
DEFINE_string(infer_model, "", "model path");
DEFINE_string(fp32_model, "", "FP32 model path");
DEFINE_string(int8_model, "", "INT8 model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_string(refer_result, "", "reference result for comparison");
DEFINE_int32(batch_size, 1, "batch size");
DEFINE_bool(ernie_large, false, "Test the ERNIE large model");
DEFINE_bool(with_accuracy_layer, true,
            "Calculate the accuracy when labels are included in the input");
DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
// Setting iterations to 0 means processing the whole dataset.
DEFINE_int32(iterations, 0, "number of batches to process");
DEFINE_int32(repeat, 1, "Run the inference program this many times.");
DEFINE_bool(test_all_data, false, "Test all of the data in the data file.");
DEFINE_int32(num_threads, 1,
             "Number of threads with which to run the inference program.");
DEFINE_bool(use_analysis, true,
            "Run the inference program in analysis mode.");
DEFINE_bool(record_benchmark, false,
            "Record the benchmark results after profiling the model.");
DEFINE_double(accuracy, 1e-3, "Result accuracy tolerance.");
DEFINE_double(quantized_accuracy, 1e-2,
              "Accepted accuracy drop for INT8 (quantized) prediction.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speed up Feed/Fetch.");
DEFINE_bool(warmup, false,
            "Use warmup to calculate elapsed_time more accurately. "
            "To reduce CI time, it is false by default.");

DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);

namespace paddle {
namespace inference {

using paddle::framework::proto::VarType;

template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();

template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
  return paddle::PaddleDType::INT64;
}

template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
  return paddle::PaddleDType::FLOAT32;
}
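
// Illustrative use: pick the dtype tag from the C++ element type when
// building a PaddleTensor, e.g.
//   PaddleTensor t;
//   t.dtype = GetPaddleDType<float>();  // PaddleDType::FLOAT32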

void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
  const auto *analysis_config =
      reinterpret_cast<const AnalysisConfig *>(config);
  if (use_analysis) {
    LOG(INFO) << *analysis_config;
    return;
  }
  LOG(INFO) << analysis_config->ToNativeConfig();
}

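// Check data against data_ref with the FLAGS_accuracy tolerance: relative
// error when |data_ref| > 1, absolute error otherwise.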
void CheckError(float data_ref, float data) {
  if (std::abs(data_ref) > 1) {
    CHECK_LE(std::abs((data_ref - data) / data_ref), FLAGS_accuracy);
  } else {
    CHECK_LE(std::abs(data_ref - data), FLAGS_accuracy);
  }
}

class Barrier {
 public:
  explicit Barrier(std::size_t count) : _count(count) {}
  void Wait() {
    std::unique_lock<std::mutex> lock(_mutex);
    if (--_count) {
      _cv.wait(lock, [this] { return _count == 0; });
    } else {
      _cv.notify_all();
    }
  }

 private:
  std::mutex _mutex;
  std::condition_variable _cv;
  std::size_t _count;
};
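
// Illustrative sketch (hypothetical usage; the names below are not defined
// in this file): make N threads start their timed section together.
//   Barrier barrier(num_threads);
//   for (int tid = 0; tid < num_threads; ++tid) {
//     threads.emplace_back([&] {
//       barrier.Wait();  // every thread blocks here until all have arrived
//       // ... timed work ...
//     });
//   }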

// Compare result between two PaddleTensor
void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
  EXPECT_GT(outputs.size(), 0UL);
  EXPECT_EQ(outputs.size(), ref_outputs.size());
  for (size_t i = 0; i < outputs.size(); i++) {
    auto &out = outputs[i];
    auto &ref_out = ref_outputs[i];
    size_t size = VecReduceToInt(out.shape);
    size_t ref_size = VecReduceToInt(ref_out.shape);
    EXPECT_GT(size, 0UL);
    EXPECT_EQ(size, ref_size);
    EXPECT_EQ(out.dtype, ref_out.dtype);
    switch (out.dtype) {
      case PaddleDType::INT64: {
        int64_t *pdata = static_cast<int64_t *>(out.data.data());
        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::FLOAT32: {
        float *pdata = static_cast<float *>(out.data.data());
        float *pdata_ref = static_cast<float *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
          CheckError(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::INT32: {
        int32_t *pdata = static_cast<int32_t *>(out.data.data());
        int32_t *pdata_ref = static_cast<int32_t *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::UINT8: {
        uint8_t *pdata = static_cast<uint8_t *>(out.data.data());
        uint8_t *pdata_ref = static_cast<uint8_t *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
    }
  }
}

// Compare result between a PaddleTensor and a ZeroCopyTensor
void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<ZeroCopyTensor> &ref_outputs) {
  EXPECT_GT(outputs.size(), 0UL);
  EXPECT_EQ(outputs.size(), ref_outputs.size());
  for (size_t i = 0; i < outputs.size(); i++) {
    auto &out = outputs[i];
    auto &ref_out = ref_outputs[i];
    size_t size = VecReduceToInt(out.shape);
    EXPECT_GT(size, 0UL);
    int ref_size = 0;  // the number of elements, not the memory size
    PaddlePlace place;
    switch (out.dtype) {
      case PaddleDType::INT64: {
        int64_t *pdata = static_cast<int64_t *>(out.data.data());
        int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
        EXPECT_EQ(size, static_cast<size_t>(ref_size));
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::FLOAT32: {
        float *pdata = static_cast<float *>(out.data.data());
        float *pdata_ref = ref_out.data<float>(&place, &ref_size);
        EXPECT_EQ(size, static_cast<size_t>(ref_size));
        for (size_t j = 0; j < size; ++j) {
          CheckError(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::INT32: {
        int32_t *pdata = static_cast<int32_t *>(out.data.data());
        int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
        EXPECT_EQ(size, static_cast<size_t>(ref_size));
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
      case PaddleDType::UINT8: {
        uint8_t *pdata = static_cast<uint8_t *>(out.data.data());
        uint8_t *pdata_ref = ref_out.data<uint8_t>(&place, &ref_size);
        EXPECT_EQ(size, static_cast<size_t>(ref_size));
        for (size_t j = 0; j < size; ++j) {
          EXPECT_EQ(pdata_ref[j], pdata[j]);
        }
        break;
      }
    }
  }
}

std::unique_ptr<PaddlePredictor> CreateTestPredictor(
    const PaddlePredictor::Config *config, bool use_analysis = true) {
  const auto *analysis_config =
      reinterpret_cast<const AnalysisConfig *>(config);
  if (use_analysis) {
    return CreatePaddlePredictor<AnalysisConfig>(*analysis_config);
  }
  auto native_config = analysis_config->ToNativeConfig();
  return CreatePaddlePredictor<NativeConfig>(native_config);
}

size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

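// Return the fusion statistics (fused-pattern name -> occurrence count)
// recorded by the analysis predictor; the number of operators remaining in
// the optimized graph is reported through *num_ops.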
std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
                                                   int *num_ops) {
  std::unordered_map<std::string, int> res;
  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
  auto *fusion_status =
      analysis_predictor->analysis_argument().fusion_statis_ptr();
  if (!fusion_status) {
    return res;
  }
  for (auto &item : *fusion_status) {
    LOG(INFO) << "fused " << item.first << " " << item.second;
  }
  int num = 0;
  for (auto &node :
       analysis_predictor->analysis_argument().main_graph().Nodes()) {
    if (node->IsOp()) {
      ++num;
    }
  }
  *num_ops = num;
  return *fusion_status;
}

void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
                       const std::string &dirname, bool is_combined = true,
                       std::string model_filename = "model",
                       std::string params_filename = "params",
                       const std::vector<std::string> *feed_names = nullptr,
                       const int continuous_input_index = 0) {
  // Set fake_image_data
  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0,
                    platform::errors::InvalidArgument(
                        "In SetFakeImageInput, expected test_all_data = false, "
                        "but now test_all_data=%d",
                        FLAGS_test_all_data));
  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
      dirname, is_combined, model_filename, params_filename);
  std::ostringstream os;
  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
      os << ", " << feed_target_shapes[i][j];
    }
    os << "}\n";
  }
  LOG(INFO) << os.str();
  if (feed_names) {
    PADDLE_ENFORCE_EQ(
        feed_names->size(), feed_target_shapes.size(),
        platform::errors::InvalidArgument(
            "The size of feed_names and the size of "
            "feed_target_shapes must be equal, but now feed_names "
            "size is %d and feed_target_shapes size is %d",
            feed_names->size(), feed_target_shapes.size()));
  }
  std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
    const auto &feed_shape = feed_target_shapes[i];
    auto &input = input_slots[i];
    std::vector<int> shape({FLAGS_batch_size});
    for (size_t s = 1; s < feed_shape.size(); ++s) {
      shape.push_back(static_cast<int>(feed_shape[s]));
    }
    if (feed_names) {
      input.name = (*feed_names)[i];
    }
    input.shape = shape;
    input.dtype = PaddleDType::FLOAT32;
    size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
                                 [](int a, int b) { return a * b; });
    input.data.Resize(len * sizeof(float));
    input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
    float *input_data = static_cast<float *>(input.data.data());
    // Fill input data; to make profiling easy, do not use random data here.
    for (size_t j = 0; j < len; ++j) {
      *(input_data + j) =
          static_cast<float>((j + continuous_input_index) % len) / len;
    }
  }
  (*inputs).emplace_back(input_slots);
}

void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
                      std::vector<std::vector<int64_t>> *out,
                      std::vector<size_t> *lod, size_t batch_iter,
                      size_t batch_end) {
  lod->clear();
  lod->push_back(0);
  for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) {
    out->push_back(*it);
    lod->push_back(lod->back() + (*it).size());  // calculate lod
  }
}
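
// Illustrative use (all_data and batch_start are hypothetical): slice a
// sequence dataset into one batch and build its level-0 LoD.
//   std::vector<std::vector<int64_t>> batch;
//   std::vector<size_t> lod;
//   GetInputPerBatch(all_data, &batch, &lod, batch_start,
//                    batch_start + FLAGS_batch_size);
//   // lod == {0, len(seq0), len(seq0) + len(seq1), ...}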

void ConvertPaddleTensorToZeroCopyTensor(
    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
  for (size_t i = 0; i < inputs.size(); i++) {
    auto input = inputs[i];
    auto tensor = predictor->GetInputTensor(input.name);
    tensor->Reshape(input.shape);
    tensor->SetLoD({input.lod});
    if (input.dtype == PaddleDType::INT64) {
      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
    } else if (input.dtype == PaddleDType::FLOAT32) {
      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
    } else if (input.dtype == PaddleDType::INT32) {
      ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
    } else if (input.dtype == PaddleDType::UINT8) {
      ZeroCopyTensorAssignData<uint8_t>(tensor.get(), input.data);
    } else {
      LOG(ERROR) << "unsupported feed type " << input.dtype;
    }
  }
}

void PredictionWarmUp(PaddlePredictor *predictor,
                      const std::vector<std::vector<PaddleTensor>> &inputs,
                      std::vector<std::vector<PaddleTensor>> *outputs,
                      int num_threads, int tid,
                      const VarType::Type data_type = VarType::FP32) {
  int batch_size = FLAGS_batch_size;
  LOG(INFO) << "Running thread " << tid << ", warm up run...";
  if (FLAGS_zero_copy) {
    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
  }
  outputs->resize(1);
  Timer warmup_timer;
  warmup_timer.tic();
  if (!FLAGS_zero_copy) {
    predictor->Run(inputs[0], &(*outputs)[0], batch_size);
  } else {
    predictor->ZeroCopyRun();
  }
  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type);
  if (FLAGS_profile) {
    paddle::platform::ResetProfiler();
  }
}

void PredictionRun(PaddlePredictor *predictor,
                   const std::vector<std::vector<PaddleTensor>> &inputs,
                   std::vector<std::vector<PaddleTensor>> *outputs,
                   int num_threads, int tid,
                   const VarType::Type data_type = VarType::FP32,
                   float *sample_latency = nullptr) {
  int num_times = FLAGS_repeat;
  int iterations = inputs.size();  // process the whole dataset ...
  if (FLAGS_iterations > 0 &&
      FLAGS_iterations < static_cast<int64_t>(inputs.size()))
    iterations =
        FLAGS_iterations;  // ... unless the number of iterations is set
  outputs->resize(iterations);
  LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads
            << ", run " << num_times << " times...";
  Timer run_timer;
  double elapsed_time = 0;
#ifdef WITH_GPERFTOOLS
  ProfilerStart("paddle_inference.prof");
#endif
  int predicted_num = 0;
  if (!FLAGS_zero_copy) {
    for (int i = 0; i < iterations; i++) {
      run_timer.tic();
      for (int j = 0; j < num_times; j++) {
        predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
      }
      elapsed_time += run_timer.toc();

      predicted_num += FLAGS_batch_size;
      if (predicted_num % 100 == 0) {
        LOG(INFO) << predicted_num << " samples";
      }
    }
  } else {
    for (int i = 0; i < iterations; i++) {
      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
      run_timer.tic();
      for (int j = 0; j < num_times; j++) {
        predictor->ZeroCopyRun();
      }
      elapsed_time += run_timer.toc();

      predicted_num += FLAGS_batch_size;
      if (predicted_num % 100 == 0) {
        LOG(INFO) << predicted_num << " samples";
      }
    }
  }
#ifdef WITH_GPERFTOOLS
  ProfilerStop();
#endif

  auto batch_latency = elapsed_time / (iterations * num_times);
  PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
            iterations, data_type);

  if (sample_latency != nullptr)
    *sample_latency = batch_latency / FLAGS_batch_size;

  if (FLAGS_record_benchmark) {
    Benchmark benchmark;
    benchmark.SetName(FLAGS_model_name);
    benchmark.SetBatchSize(FLAGS_batch_size);
    benchmark.SetLatency(batch_latency);
    benchmark.PersistToFile("benchmark_record.txt");
  }
}

void TestOneThreadPrediction(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true,
    const VarType::Type data_type = VarType::FP32,
    float *sample_latency = nullptr) {
  auto predictor = CreateTestPredictor(config, use_analysis);
  if (FLAGS_warmup) {
    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type);
  }
  PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type,
                sample_latency);
}

void TestMultiThreadPrediction(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<std::vector<PaddleTensor>> *outputs, int num_threads,
    bool use_analysis = true) {
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
  for (int tid = 1; tid < num_threads; tid++) {
    predictors.emplace_back(predictors.front()->Clone());
  }

  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
      std::vector<std::vector<PaddleTensor>> outputs_tid;
      auto &predictor = predictors[tid];
      if (FLAGS_warmup) {
        PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads,
                         tid);
      }
      PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
    });
  }
  for (int i = 0; i < num_threads; ++i) {
    threads[i].join();
  }
}

void TestPrediction(const PaddlePredictor::Config *config,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
                    std::vector<std::vector<PaddleTensor>> *outputs,
                    int num_threads, bool use_analysis = FLAGS_use_analysis) {
  PrintConfig(config, use_analysis);
  if (num_threads == 1) {
    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
                              use_analysis);
  }
}
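
// Typical driver usage in a test (illustrative sketch; SetInput stands for a
// per-test helper that is not defined in this file):
//   AnalysisConfig cfg;
//   cfg.SetModel(FLAGS_infer_model);
//   std::vector<std::vector<PaddleTensor>> inputs, outputs;
//   SetInput(&inputs);
//   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
//                  inputs, &outputs, FLAGS_num_threads);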

void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
                       int compared_idx) {
  PADDLE_ENFORCE_LE(
      compared_idx, 2,
      platform::errors::InvalidArgument(
          "The compared_idx should be <= 2. But received compared_idx = %d. "
          "For top1 accuracy, set compared_idx = 1; For top5 accuracy or mean "
          "Average Precision (mAP), set compared_idx = 2.",
          compared_idx));
  PADDLE_ENFORCE_GE(
      compared_idx, 1,
      platform::errors::InvalidArgument(
          "The compared_idx should be >= 1. But received compared_idx = %d. "
          "For top1 accuracy, set compared_idx = 1; For top5 accuracy or mean "
          "Average Precision (mAP), set compared_idx = 2.",
          compared_idx));
  std::string prefix = (compared_idx == 1) ? "top1_accuracy " : "mAP ";
  LOG(INFO) << "--- Accuracy summary --- ";
  LOG(INFO) << "Accepted " << prefix
            << "drop threshold: " << FLAGS_quantized_accuracy
            << ". (condition: (FP32_" << prefix << " - INT8_" << prefix
            << ") <= threshold)";
  LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6)
            << std::setprecision(4) << avg_acc_fp32;
  LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6)
            << std::setprecision(4) << avg_acc_int8;
}

void SummarizePerformance(const char *title, float sample) {
  CHECK_GT(sample, 0.0);
  auto throughput = 1000.0 / sample;
  LOG(INFO) << title << ": avg fps: " << std::fixed << std::setw(6)
            << std::setprecision(4) << throughput << ", avg latency: " << sample
            << " ms";
}

void SummarizePerformance(float sample_latency_fp32,
                          float sample_latency_int8) {
  if (FLAGS_enable_fp32) SummarizePerformance("FP32", sample_latency_fp32);
  if (FLAGS_enable_int8) SummarizePerformance("INT8", sample_latency_int8);
}

float CompareAccuracyOne(
    const std::vector<std::vector<PaddleTensor>> &output_slots,
    int compared_idx) {
  PADDLE_ENFORCE_GT(output_slots.size(), 0,
                    platform::errors::InvalidArgument(
                        "The accuracy vector is empty. The accuracy vector "
                        "size must be greater than 0"));

  float total_accs{0};

  for (size_t i = 0; i < output_slots.size(); ++i) {
    switch (compared_idx) {
      case 1:
        PADDLE_ENFORCE_GE(
            output_slots[i].size(), 2UL,
            platform::errors::InvalidArgument(
                "To achieve top 1 accuracy, output_slots size "
                "must be bigger than or equal to 2, but now the size is %d",
                output_slots[i].size()));
        break;
      case 2:
        PADDLE_ENFORCE_GE(
            output_slots[i].size(), 3UL,
            platform::errors::InvalidArgument(
                "To achieve top 5 accuracy or mean Average "
                "Precision (mAP), output_slots size must be "
                "bigger than or equal to 3, but now the size is %d",
                output_slots[i].size()));
        break;
      default:
        throw std::invalid_argument(
            "CompareAccuracy: compared_idx is out of range.");
    }

    if (output_slots[i][compared_idx].lod.size() > 0)
      throw std::invalid_argument("CompareAccuracy: output has nonempty LoD.");

    if (output_slots[i][compared_idx].dtype != paddle::PaddleDType::FLOAT32)
      throw std::invalid_argument(
          "CompareAccuracy: output is of a wrong type.");

    total_accs +=
        *static_cast<float *>(output_slots[i][compared_idx].data.data());
  }

  return total_accs / output_slots.size();
}

void CompareAccuracy(
    const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
    const std::vector<std::vector<PaddleTensor>> &output_slots_ref,
    int compared_idx) {
  if ((FLAGS_enable_fp32 && FLAGS_enable_int8) &&
      (output_slots_quant.size() == 0 || output_slots_ref.size() == 0))
    throw std::invalid_argument(
        "CompareAccuracy: output_slots vector is empty.");

  float avg_acc_quant = 0.0;
  float avg_acc_ref = 0.0;

  if (FLAGS_enable_int8)
    avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx);

  if (FLAGS_enable_fp32)
    avg_acc_ref = CompareAccuracyOne(output_slots_ref, compared_idx);

  SummarizeAccuracy(avg_acc_ref, avg_acc_quant, compared_idx);

  if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0);

  if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0);

  if (FLAGS_enable_fp32 && FLAGS_enable_int8)
    CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy);
}

void CompareDeterministic(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  auto predictor = CreateTestPredictor(config, FLAGS_use_analysis);

  std::vector<PaddleTensor> warmup_outputs, outputs;
  // Run num_times to compare deterministic results.
  for (size_t j = 0; j < inputs.size(); j++) {
    // warmup run
    predictor->Run(inputs[j], &warmup_outputs, batch_size);
    for (int i = 0; i < num_times; i++) {
      predictor->Run(inputs[j], &outputs, batch_size);
      CompareResult(outputs, warmup_outputs);
    }
  }
}

void CompareNativeAndAnalysis(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
  PrintConfig(config, true);
  std::vector<std::vector<PaddleTensor>> native_outputs, analysis_outputs;
  TestOneThreadPrediction(config, inputs, &native_outputs, false);
  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
  PADDLE_ENFORCE_GT(native_outputs.size(), 0,
                    platform::errors::InvalidArgument(
                        "The native outputs is empty. The native outputs "
                        "vector size must be bigger than 0"));
  PADDLE_ENFORCE_GT(analysis_outputs.size(), 0,
                    platform::errors::InvalidArgument(
                        "The analysis outputs is empty. The analysis outputs "
                        "vector size must be bigger than 0"));
  CompareResult(analysis_outputs.back(), native_outputs.back());
}

void CompareQuantizedAndAnalysis(
    const AnalysisConfig *config, const AnalysisConfig *qconfig,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    const int compared_idx = 1) {
  PADDLE_ENFORCE_EQ(
      inputs[0][0].shape[0], FLAGS_batch_size,
      platform::errors::InvalidArgument(
          "Input data has to be packed batch by batch. The batch size is set "
          "to %d, but the real input is packed with batch size = %d",
          FLAGS_batch_size, inputs[0][0].shape[0]));
  LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
            << ", warmup batch size " << FLAGS_warmup_batch_size << ".";

  LOG(INFO) << "--- FP32 prediction start ---";
  auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
  PrintConfig(cfg, true);
  std::vector<std::vector<PaddleTensor>> analysis_outputs;
  float sample_latency_fp32{-1};

  if (FLAGS_enable_fp32) {
    TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32,
                            &sample_latency_fp32);
  }

  LOG(INFO) << "--- INT8 prediction start ---";
  auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
  PrintConfig(qcfg, true);
  std::vector<std::vector<PaddleTensor>> quantized_outputs;
  float sample_latency_int8{-1};

  if (FLAGS_enable_int8) {
    TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true,
                            VarType::INT8, &sample_latency_int8);
  }
  SummarizePerformance(sample_latency_fp32, sample_latency_int8);

  CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx);
}
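
// Illustrative call (fp32_cfg and int8_cfg are hypothetical AnalysisConfig
// instances prepared by the test):
//   CompareQuantizedAndAnalysis(&fp32_cfg, &int8_cfg, inputs,
//                               /*compared_idx=*/1);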

void CompareAnalysisAndAnalysis(
    const AnalysisConfig *config1, const AnalysisConfig *config2,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    const bool with_accuracy_layer = FLAGS_with_accuracy_layer,
    const int compared_idx = 1) {
  PADDLE_ENFORCE_EQ(
      inputs[0][0].shape[0], FLAGS_batch_size,
      platform::errors::InvalidArgument(
          "Input data has to be packed batch by batch. The batchsize is set to "
          "%d, but the real input is packed with batchsize = %d",
          FLAGS_batch_size, inputs[0][0].shape[0]));

  LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
            << ", warmup batch size " << FLAGS_warmup_batch_size << ".";

  LOG(INFO) << "--- FP32 prediction start ---";
  auto *cfg1 = reinterpret_cast<const PaddlePredictor::Config *>(config1);
  PrintConfig(cfg1, true);
  std::vector<std::vector<PaddleTensor>> analysis_outputs;
  float sample_latency_fp32{-1};

  if (FLAGS_enable_fp32) {
    TestOneThreadPrediction(cfg1, inputs, &analysis_outputs, true,
                            VarType::FP32, &sample_latency_fp32);
  }

  LOG(INFO) << "--- INT8 prediction start ---";
  auto *cfg2 = reinterpret_cast<const PaddlePredictor::Config *>(config2);
  PrintConfig(cfg2, true);
  std::vector<std::vector<PaddleTensor>> int8_outputs;
  float sample_latency_int8{-1};

  if (FLAGS_enable_int8) {
    TestOneThreadPrediction(cfg2, inputs, &int8_outputs, true, VarType::INT8,
                            &sample_latency_int8);
  }
  SummarizePerformance(sample_latency_fp32, sample_latency_int8);
  if (with_accuracy_layer) {
    CompareAccuracy(int8_outputs, analysis_outputs, compared_idx);
  }
}

void CompareNativeAndAnalysis(
    PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
  int batch_size = FLAGS_batch_size;
  std::vector<PaddleTensor> native_outputs, analysis_outputs;
  native_pred->Run(inputs[0], &native_outputs, batch_size);
  analysis_pred->Run(inputs[0], &analysis_outputs, batch_size);
  CompareResult(analysis_outputs, native_outputs);
}

void CompareAnalysisAndZeroCopy(
    PaddlePredictor::Config *config, PaddlePredictor::Config *config1,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    const std::vector<std::string> &outputs_name) {
  int batch_size = FLAGS_batch_size;
  // analysis
  std::vector<PaddleTensor> analysis_outputs;
  auto predictor = CreateTestPredictor(config, true);
  predictor->Run(inputs[0], &analysis_outputs, batch_size);
  // analysis + zero_copy
  std::vector<ZeroCopyTensor> zerocopy_outputs;
  reinterpret_cast<AnalysisConfig *>(config1)->SwitchUseFeedFetchOps(false);
  predictor = CreateTestPredictor(config1, true);
  ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
  predictor->ZeroCopyRun();
  for (size_t i = 0; i < outputs_name.size(); i++) {
    ZeroCopyTensor zerocopy_output =
        *predictor->GetOutputTensor(outputs_name[i]).get();
    zerocopy_outputs.emplace_back(zerocopy_output);
    LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
  }
  // compare
  CompareResult(analysis_outputs, zerocopy_outputs);
}

void SaveOptimModel(AnalysisConfig *cfg, const std::string &dstPath) {
  auto predictor = CreateTestPredictor(
      reinterpret_cast<const PaddlePredictor::Config *>(cfg),
      FLAGS_use_analysis);
  (static_cast<AnalysisPredictor *>(predictor.get()))->SaveOptimModel(dstPath);
}

template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
  std::stringstream ss;
  ss << "\n---- tensor ---" << '\n';
  ss << "lod: [";
  for (const auto &level : tensor.lod()) {
    ss << "[ ";
    for (auto i : level) {
      ss << i << ", ";
    }
    ss << "]";
  }
  ss << "]\n";

  ss << "shape: [";
  int size = 1;
  for (int i = 0; i < tensor.dims().size(); i++) {
    int dim = tensor.dims()[i];
    ss << dim << ", ";
    size *= dim;
  }
  ss << "]\n";

  ss << "data: ";
  for (int i = 0; i < std::min(20, size); i++) {
    ss << tensor.data<T>()[i] << " ";
  }
  ss << "\n";

  return ss.str();
}

static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    auto &al = a[i];
    auto &bl = b[i];
    if (al.size() != bl.size()) {
      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
                                    bl.size());
      return false;
    }
  }
  return true;
}

static bool CompareShape(const std::vector<int64_t> &a,
                         const std::vector<int64_t> &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    if (a[i] != b[i]) {
      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
                                    a[i], b[i]);
      return false;
    }
  }
  return true;
}

static bool CompareTensorData(const framework::LoDTensor &a,
                              const framework::LoDTensor &b) {
  auto a_shape = framework::vectorize(a.dims());
  auto b_shape = framework::vectorize(b.dims());
  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1},
                                  [](int a, int b) { return a * b; });
  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1},
                                  [](int a, int b) { return a * b; });
  if (a_size != b_size) {
    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
                                  a_size, b_size);
    return false;
  }

  for (size_t i = 0; i < a_size; i++) {
    if (a.type() == VarType::FP32) {
      const auto *a_data = a.data<float>();
      const auto *b_data = b.data<float>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    } else if (a.type() == VarType::INT64) {
      const auto *a_data = a.data<int64_t>();
      const auto *b_data = b.data<int64_t>();
      if (a_data[i] != b_data[i]) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %d != %d", i, a_data[i],
            b_data[i]);
        return false;
      }
    }
  }

  return true;
}

static bool CompareTensor(const framework::LoDTensor &a,
                          const framework::LoDTensor &b) {
  if (!CompareLoD(a.lod(), b.lod())) {
    return false;
  }
  if (!CompareShape(framework::vectorize(a.dims()),
                    framework::vectorize(b.dims()))) {
    return false;
  }

  if (!CompareTensorData(a, b)) {
    return false;
  }

  return true;
}

}  // namespace inference
}  // namespace paddle