Unverified commit 7f482be3, authored by hong19860320, committed by GitHub


[cherry-pick] [CI] Enable CI for Huawei Kirin NPU, Rockchip NPU and MediaTek APU (#4408), add model unit tests (#4277), update bert and ernie unit tests (#4357) (#4450)
Parent aaba6447
@@ -38,25 +38,30 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT LITE_ON_TINY_PUBLISH)
 endif()
 if (WITH_TESTING)
+  set(LITE_URL_FOR_UNITTESTS "http://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests")
+  # models
   lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
-  lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
-  lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
-  lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
   if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
-  endif()
-  if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "mobilenet_v1_int8_for_mediatek_apu.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "mobilenet_v1_int8_for_rockchip_npu.tar.gz")
+  else()
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
-    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
-    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "resnet50.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ernie.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "GoogLeNet.tar.gz")
+    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
   endif()
+  # data
+  lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
+  lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
 endif()
 # ----------------------------- PUBLISH -----------------------------
...
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${apu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
+function(lite_cc_test_with_model_and_data TARGET)
+  if(NOT WITH_TESTING)
+    return()
+  endif()
+  set(options "")
+  set(oneValueArgs MODEL DATA CONFIG ARGS)
+  set(multiValueArgs "")
+  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(ARGS "")
+  if(DEFINED args_MODEL)
+    set(ARGS "${ARGS} --model_dir=${LITE_MODEL_DIR}/${args_MODEL}")
+  endif()
+  if(DEFINED args_DATA)
+    set(ARGS "${ARGS} --data_dir=${LITE_MODEL_DIR}/${args_DATA}")
+  endif()
+  if(DEFINED args_CONFIG)
+    set(ARGS "${ARGS} --config_dir=${LITE_MODEL_DIR}/${args_CONFIG}")
+  endif()
+  if(DEFINED args_ARGS)
+    set(ARGS "${ARGS} ${args_ARGS}")
+  endif()
+  lite_cc_test(${TARGET} SRCS ${TARGET}.cc
+               DEPS ${lite_model_test_DEPS} paddle_api_full
+               ARM_DEPS ${arm_kernels}
+               X86_DEPS ${x86_kernels}
+               NPU_DEPS ${npu_kernels} ${npu_bridges}
+               HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ${huawei_ascend_npu_bridges}
+               XPU_DEPS ${xpu_kernels} ${xpu_bridges}
+               APU_DEPS ${apu_kernels} ${apu_bridges}
+               RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges}
+               BM_DEPS ${bm_kernels} ${bm_bridges}
+               MLU_DEPS ${mlu_kernels} ${mlu_bridges}
+               ARGS ${ARGS} SERIAL)
+  if(DEFINED args_MODEL)
+    add_dependencies(${TARGET} extern_lite_download_${args_MODEL}_tar_gz)
+  endif()
+  if(DEFINED args_DATA)
+    add_dependencies(${TARGET} extern_lite_download_${args_DATA}_tar_gz)
+  endif()
+  if(DEFINED args_CONFIG)
+    add_dependencies(${TARGET} extern_lite_download_${args_CONFIG}_tar_gz)
+  endif()
+endfunction()
 if(LITE_WITH_ARM)
-  lite_cc_test(test_transformer_with_mask_fp32_arm SRCS test_transformer_with_mask_fp32_arm.cc
-               DEPS ${lite_model_test_DEPS} paddle_api_full
-               ARM_DEPS ${arm_kernels}
-               ARGS --model_dir=${LITE_MODEL_DIR}/transformer_with_mask_fp32 SERIAL)
-  if(WITH_TESTING)
-    add_dependencies(test_transformer_with_mask_fp32_arm extern_lite_download_transformer_with_mask_fp32_tar_gz)
-  endif()
+  lite_cc_test_with_model_and_data(test_transformer_with_mask_fp32_arm MODEL transformer_with_mask_fp32 ARGS)
+endif()
+if(LITE_WITH_NPU)
+  lite_cc_test_with_model_and_data(test_mobilenetv1_fp32_huawei_kirin_npu MODEL mobilenet_v1 DATA ILSVRC2012_small)
+  lite_cc_test_with_model_and_data(test_mobilenetv2_fp32_huawei_kirin_npu MODEL mobilenet_v2_relu DATA ILSVRC2012_small)
+  lite_cc_test_with_model_and_data(test_resnet50_fp32_huawei_kirin_npu MODEL resnet50 DATA ILSVRC2012_small)
 endif()
 if(LITE_WITH_XPU AND NOT LITE_WITH_XTCL)
-  lite_cc_test(test_resnet50_fp32_xpu SRCS test_resnet50_fp32_xpu.cc
-               DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
-               ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-               ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
-  lite_cc_test(test_ernie_fp32_xpu SRCS test_ernie_fp32_xpu.cc
-               DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
-               ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-               ARGS --model_dir=${LITE_MODEL_DIR}/ernie)
-  lite_cc_test(test_bert_fp32_xpu SRCS test_bert_fp32_xpu.cc
-               DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
-               ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-               ARGS --model_dir=${LITE_MODEL_DIR}/bert)
-  if(WITH_TESTING)
-    add_dependencies(test_resnet50_fp32_xpu extern_lite_download_resnet50_tar_gz)
-    add_dependencies(test_ernie_fp32_xpu extern_lite_download_ernie_tar_gz)
-    add_dependencies(test_bert_fp32_xpu extern_lite_download_bert_tar_gz)
-  endif()
-  # TODO(miaotianxiang): enable later
-  #lite_cc_test(test_fpr_fp32_xpu SRCS test_fpr_fp32_xpu.cc
-  #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
-  #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-  #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
-  #lite_cc_test(test_mmdnn_fp32_xpu SRCS test_mmdnn_fp32_xpu.cc
-  #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
-  #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-  #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+  lite_cc_test_with_model_and_data(test_resnet50_fp32_xpu MODEL resnet50 DATA ILSVRC2012_small)
+  lite_cc_test_with_model_and_data(test_googlenet_fp32_xpu MODEL GoogLeNet DATA ILSVRC2012_small)
+  lite_cc_test_with_model_and_data(test_vgg19_fp32_xpu MODEL VGG19 DATA ILSVRC2012_small)
+  lite_cc_test_with_model_and_data(test_ernie_fp32_xpu MODEL ernie DATA bert_data)
+  lite_cc_test_with_model_and_data(test_bert_fp32_xpu MODEL bert DATA bert_data)
 endif()
 if(LITE_WITH_RKNPU)
-  lite_cc_test(test_mobilenetv1_int8_rknpu SRCS test_mobilenetv1_int8_rknpu.cc
-               DEPS ${lite_model_test_DEPS} paddle_api_full
-               RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges}
-               ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
+  lite_cc_test_with_model_and_data(test_mobilenetv1_int8_rockchip_npu MODEL mobilenet_v1_int8_for_rockchip_npu DATA ILSVRC2012_small)
 endif()
 if(LITE_WITH_APU)
-  lite_cc_test(test_mobilenetv1_int8_apu SRCS test_mobilenetv1_int8_apu.cc
-               DEPS ${lite_model_test_DEPS} paddle_api_full
-               APU_DEPS ${apu_kernels} ${apu_bridges}
-               ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
+  lite_cc_test_with_model_and_data(test_mobilenetv1_int8_mediatek_apu MODEL mobilenet_v1_int8_for_mediatek_apu DATA ILSVRC2012_small)
 endif()
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
template <class T = float>
std::vector<std::vector<T>> ReadRawData(
const std::string& raw_data_dir,
const std::vector<int>& input_shape = {1, 3, 224, 224},
int iteration = 100) {
std::vector<std::vector<T>> raw_data;
int image_size = 1;
for (size_t i = 1; i < input_shape.size(); i++) {
image_size *= input_shape[i];
}
int input_size = image_size * input_shape[0];
for (int i = 0; i < iteration; i++) {
std::vector<T> one_iter_raw_data;
one_iter_raw_data.resize(input_size);
T* data = &(one_iter_raw_data.at(0));
for (int j = 0; j < input_shape[0]; j++) {
std::string raw_data_file_dir =
raw_data_dir + std::string("/") +
std::to_string(i * input_shape[0] + j + 1);
std::ifstream fin(raw_data_file_dir, std::ios::in | std::ios::binary);
CHECK(fin.is_open()) << "failed to open file " << raw_data_file_dir;
fin.seekg(0, std::ios::end);
int file_size = fin.tellg();
fin.seekg(0, std::ios::beg);
CHECK_EQ(file_size, static_cast<int>(image_size * sizeof(T)));
fin.read(reinterpret_cast<char*>(data), file_size);
fin.close();
data += image_size;
}
raw_data.emplace_back(one_iter_raw_data);
}
return raw_data;
}
float CalOutAccuracy(const std::vector<std::vector<float>>& out_rets,
const std::string& labels_dir) {
int right_num = 0;
auto label_lines = ReadLines(labels_dir);
for (size_t i = 0; i < out_rets.size(); i++) {
int label = std::stoi(Split(label_lines[i], " ")[1]);
const auto& out = out_rets[i];
auto largest = std::max_element(out.begin(), out.end());
int top1_idx = std::distance(out.begin(), largest);
right_num += (top1_idx == label);
}
return static_cast<float>(right_num) / static_cast<float>(out_rets.size());
}
} // namespace lite
} // namespace paddle
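Note on the data layout the helpers above assume: ReadRawData expects raw_data_dir to hold one raw binary file per image, named with 1-based indices ("1", "2", ...), each containing exactly channel * height * width float32 values; CalOutAccuracy expects labels.txt to carry one space-separated pair per line whose second field is the ground-truth class id. A minimal sketch of writing one such image file (the helper name is ours, not part of the commit):

#include <fstream>
#include <string>
#include <vector>

// Writes image number `index` (1-based) in the raw float32 layout that
// ReadRawData() reads back: a bare blob of channel*height*width floats.
void WriteRawImage(const std::string& raw_data_dir,
                   int index,
                   const std::vector<float>& chw_pixels) {
  std::ofstream fout(raw_data_dir + "/" + std::to_string(index),
                     std::ios::out | std::ios::binary);
  fout.write(reinterpret_cast<const char*>(chw_pixels.data()),
             chw_pixels.size() * sizeof(float));
}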
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
template <class T = int64_t>
void ReadRawData(const std::string& input_data_dir,
std::vector<std::vector<T>>* input0,
std::vector<std::vector<T>>* input1,
std::vector<std::vector<T>>* input2,
std::vector<std::vector<T>>* input3,
std::vector<std::vector<int64_t>>* input_shapes) {
auto lines = ReadLines(input_data_dir);
for (auto line : lines) {
std::vector<std::string> shape_and_data = Split(line, ";");
std::vector<int64_t> input_shape =
Split<int64_t>(Split(shape_and_data[0], ":")[0], " ");
input_shapes->emplace_back(input_shape);
std::vector<T> input0_data =
Split<T>(Split(shape_and_data[0], ":")[1], " ");
input0->emplace_back(input0_data);
std::vector<T> input1_data =
Split<T>(Split(shape_and_data[1], ":")[1], " ");
input1->emplace_back(input1_data);
std::vector<T> input2_data =
Split<T>(Split(shape_and_data[2], ":")[1], " ");
input2->emplace_back(input2_data);
std::vector<T> input3_data =
Split<T>(Split(shape_and_data[3], ":")[1], " ");
input3->emplace_back(input3_data);
}
}
template <class T = int64_t>
void FillTensor(const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
int tensor_id,
const std::vector<int64_t>& tensor_shape,
const std::vector<T>& tensor_value) {
predictor->GetInput(tensor_id)->Resize(tensor_shape);
int64_t tensor_size = 1;
for (size_t i = 0; i < tensor_shape.size(); i++) {
tensor_size *= tensor_shape[i];
}
CHECK_EQ(static_cast<size_t>(tensor_size), tensor_value.size());
memcpy(predictor->GetInput(tensor_id)->mutable_data<T>(),
tensor_value.data(),
sizeof(T) * tensor_size);
}
float CalBertOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
std::vector<size_t> out_index{0, 1, 2};
std::vector<size_t> ref_out_index{0, 1, 2};
std::sort(out_index.begin(),
out_index.end(),
[&out, i](size_t a, size_t b) { return out[i][a] > out[i][b]; });
std::sort(ref_out_index.begin(),
ref_out_index.end(),
[&ref_out, i](size_t a, size_t b) {
return ref_out[i][a] > ref_out[i][b];
});
right_num += (out_index == ref_out_index);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
float CalErnieOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
right_num += (std::fabs(out[i][0] - ref_out[i][0]) < 0.01f);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
} // namespace lite
} // namespace paddle
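For reference, the ReadRawData parser above accepts one sample per line of the input text file, with the four model inputs separated by ';' and each input written as "shape:data", both parts space-separated; only the shape attached to the first input is kept, and it is reused for all four. A minimal writer matching that grammar (a sketch; the function name is ours):

#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Emits one "shape:data;shape:data;shape:data;shape:data" line in the
// exact format ReadRawData() above splits on ';' and ':'.
void WriteBertInputLine(std::ofstream& fout,
                        const std::vector<int64_t>& shape,
                        const std::vector<std::vector<int64_t>>& inputs) {
  for (size_t f = 0; f < inputs.size(); ++f) {
    for (size_t k = 0; k < shape.size(); ++k) {
      fout << shape[k] << (k + 1 < shape.size() ? " " : "");
    }
    fout << ":";
    for (size_t k = 0; k < inputs[f].size(); ++k) {
      fout << inputs[f][k] << (k + 1 < inputs[f].size() ? " " : "");
    }
    fout << (f + 1 < inputs.size() ? ";" : "\n");
  }
}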
@@ -21,23 +21,16 @@
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
+#include "lite/tests/api/bert_utility.h"
 #include "lite/utils/cp_logging.h"
+DEFINE_string(data_dir, "", "data dir");
+DEFINE_int32(iteration, 9, "iteration times to run");
 namespace paddle {
 namespace lite {
-template <typename T>
-lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
-  lite::Tensor ret;
-  ret.Resize(shape);
-  T* ptr = ret.mutable_data<T>();
-  for (int i = 0; i < ret.numel(); ++i) {
-    ptr[i] = (T)1;
-  }
-  return ret;
-}
-TEST(Ernie, test_ernie_fp32_xpu) {
+TEST(Bert, test_bert_fp32_xpu) {
   lite_api::CxxConfig config;
   config.set_model_dir(FLAGS_model_dir);
   config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
@@ -46,56 +39,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
   config.set_xpu_workspace_l3_size_per_thread();
   auto predictor = lite_api::CreatePaddlePredictor(config);
-  int64_t batch_size = 1;
-  int64_t seq_len = 64;
-  Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
-  std::vector<int64_t> input_shape{batch_size, seq_len, 1};
-  predictor->GetInput(0)->Resize(input_shape);
-  predictor->GetInput(1)->Resize(input_shape);
-  predictor->GetInput(2)->Resize(input_shape);
-  predictor->GetInput(3)->Resize(input_shape);
-  memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
+  std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
+  std::vector<std::vector<int64_t>> input0;
+  std::vector<std::vector<int64_t>> input1;
+  std::vector<std::vector<int64_t>> input2;
+  std::vector<std::vector<int64_t>> input3;
+  std::vector<std::vector<int64_t>> input_shapes;
+  ReadRawData(
+      input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
   for (int i = 0; i < FLAGS_warmup; ++i) {
+    std::vector<int64_t> shape = {1, 64, 1};
+    std::vector<int64_t> fill_value(64, 0);
+    for (int j = 0; j < 4; j++) {
+      FillTensor(predictor, j, shape, fill_value);
+    }
     predictor->Run();
   }
-  auto start = GetCurrentUS();
-  for (int i = 0; i < FLAGS_repeats; ++i) {
+  std::vector<std::vector<float>> out_rets;
+  out_rets.resize(FLAGS_iteration);
+  double cost_time = 0;
+  for (int i = 0; i < FLAGS_iteration; ++i) {
+    FillTensor(predictor, 0, input_shapes[i], input0[i]);
+    FillTensor(predictor, 1, input_shapes[i], input1[i]);
+    FillTensor(predictor, 2, input_shapes[i], input2[i]);
+    FillTensor(predictor, 3, input_shapes[i], input3[i]);
+    double start = GetCurrentUS();
     predictor->Run();
+    cost_time += GetCurrentUS() - start;
+    auto output_tensor = predictor->GetOutput(0);
+    auto output_shape = output_tensor->shape();
+    auto output_data = output_tensor->data<float>();
+    ASSERT_EQ(output_shape.size(), 2UL);
+    ASSERT_EQ(output_shape[0], 1);
+    ASSERT_EQ(output_shape[1], 3);
+    int output_size = output_shape[0] * output_shape[1];
+    out_rets[i].resize(output_size);
+    memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
   }
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
-            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
-            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
-            << " ms in average.";
+            << ", warmup: " << FLAGS_warmup
+            << ", iteration: " << FLAGS_iteration << ", spend "
+            << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
-  std::vector<std::vector<float>> results;
-  results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022}));
-  auto out = predictor->GetOutput(0);
-  ASSERT_EQ(out->shape().size(), 2);
-  ASSERT_EQ(out->shape()[0], 1);
-  ASSERT_EQ(out->shape()[1], 3);
-  for (size_t i = 0; i < results.size(); ++i) {
-    for (size_t j = 0; j < results[i].size(); ++j) {
-      EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 3e-5);
-    }
-  }
+  std::string ref_out_file = FLAGS_data_dir + std::string("/bert_out.txt");
+  float out_accuracy = CalBertOutAccuracy(out_rets, ref_out_file);
+  ASSERT_GT(out_accuracy, 0.95f);
 }
 } // namespace lite
...
@@ -21,8 +21,12 @@
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
+#include "lite/tests/api/bert_utility.h"
 #include "lite/utils/cp_logging.h"
+DEFINE_string(data_dir, "", "data dir");
+DEFINE_int32(iteration, 9, "iteration times to run");
 namespace paddle {
 namespace lite {
@@ -46,56 +50,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
   config.set_xpu_workspace_l3_size_per_thread();
   auto predictor = lite_api::CreatePaddlePredictor(config);
-  int64_t batch_size = 1;
-  int64_t seq_len = 64;
-  Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
-  std::vector<int64_t> input_shape{batch_size, seq_len, 1};
-  predictor->GetInput(0)->Resize(input_shape);
-  predictor->GetInput(1)->Resize(input_shape);
-  predictor->GetInput(2)->Resize(input_shape);
-  predictor->GetInput(3)->Resize(input_shape);
-  memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
-  memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
-         sample_input.raw_data(),
-         sizeof(int64_t) * batch_size * seq_len);
+  std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
+  std::vector<std::vector<int64_t>> input0;
+  std::vector<std::vector<int64_t>> input1;
+  std::vector<std::vector<int64_t>> input2;
+  std::vector<std::vector<int64_t>> input3;
+  std::vector<std::vector<int64_t>> input_shapes;
+  ReadRawData(
+      input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
   for (int i = 0; i < FLAGS_warmup; ++i) {
+    std::vector<int64_t> shape = {1, 64, 1};
+    std::vector<int64_t> fill_value(64, 0);
+    for (int j = 0; j < 4; j++) {
+      FillTensor(predictor, j, shape, fill_value);
+    }
     predictor->Run();
   }
-  auto start = GetCurrentUS();
-  for (int i = 0; i < FLAGS_repeats; ++i) {
+  std::vector<std::vector<float>> out_rets;
+  out_rets.resize(FLAGS_iteration);
+  double cost_time = 0;
+  for (int i = 0; i < FLAGS_iteration; ++i) {
+    FillTensor(predictor, 0, input_shapes[i], input0[i]);
+    FillTensor(predictor, 1, input_shapes[i], input1[i]);
+    FillTensor(predictor, 2, input_shapes[i], input2[i]);
+    FillTensor(predictor, 3, input_shapes[i], input3[i]);
+    double start = GetCurrentUS();
     predictor->Run();
+    cost_time += GetCurrentUS() - start;
+    auto output_tensor = predictor->GetOutput(0);
+    auto output_shape = output_tensor->shape();
+    auto output_data = output_tensor->data<float>();
+    ASSERT_EQ(output_shape.size(), 2UL);
+    ASSERT_EQ(output_shape[0], 1);
+    ASSERT_EQ(output_shape[1], 1);
+    int output_size = output_shape[0] * output_shape[1];
+    out_rets[i].resize(output_size);
+    memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
   }
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
-            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
-            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
-            << " ms in average.";
+            << ", warmup: " << FLAGS_warmup
+            << ", iteration: " << FLAGS_iteration << ", spend "
+            << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
-  std::vector<std::vector<float>> results;
-  results.emplace_back(std::vector<float>({0.108398}));
-  auto out = predictor->GetOutput(0);
-  ASSERT_EQ(out->shape().size(), 2);
-  ASSERT_EQ(out->shape()[0], 1);
-  ASSERT_EQ(out->shape()[1], 1);
-  for (size_t i = 0; i < results.size(); ++i) {
-    for (size_t j = 0; j < results[i].size(); ++j) {
-      EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 2e-5);
-    }
-  }
+  std::string ref_out_file = FLAGS_data_dir + std::string("/ernie_out.txt");
+  float out_accuracy = CalErnieOutAccuracy(out_rets, ref_out_file);
+  ASSERT_GT(out_accuracy, 0.95f);
 }
 } // namespace lite
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(GoogLeNet, test_googlenet_fp32_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GT(out_accuracy, 0.57f);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(MobileNetV1, test_mobilenetv1_fp32_huawei_kirin_npu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}});
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GE(out_accuracy, 0.57f);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
using namespace paddle::lite_api; // NOLINT
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
inline int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0]
<< " model_dir [thread_num] [warmup_times] [repeat_times] "
"[input_data_path] [output_data_path]"
<< std::endl;
return -1;
}
std::string model_dir = argv[1];
int thread_num = 1;
if (argc > 2) {
thread_num = atoi(argv[2]);
}
int warmup_times = 5;
if (argc > 3) {
warmup_times = atoi(argv[3]);
}
int repeat_times = 10;
if (argc > 4) {
repeat_times = atoi(argv[4]);
}
std::string input_data_path;
if (argc > 5) {
input_data_path = argv[5];
}
std::string output_data_path;
if (argc > 6) {
output_data_path = argv[6];
}
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
config.set_valid_places(
{paddle::lite_api::Place{
TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto input_data = input_tensor->mutable_data<float>();
auto input_size = ShapeProduction(input_tensor->shape());
// test loop
int total_imgs = 500;
float test_num = 0;
float top1_num = 0;
float top5_num = 0;
int output_len = 1000;
std::vector<int> index(output_len);
bool debug = true;
int show_step = 500;
for (int i = 0; i < total_imgs; i++) {
// set input
std::string filename = input_data_path + "/" + std::to_string(i);
std::ifstream fs(filename, std::ifstream::binary);
if (!fs.is_open()) {
std::cerr << "open input file fail: " << filename << std::endl;
return -1;
}
auto input_data_tmp = input_data;
for (int j = 0; j < input_size; ++j) {
fs.read(reinterpret_cast<char*>(input_data_tmp), sizeof(*input_data_tmp));
input_data_tmp++;
}
int label = 0;
fs.read(reinterpret_cast<char*>(&label), sizeof(label));
fs.close();
if (debug && i % show_step == 0) {
std::cout << "input data:" << std::endl;
std::cout << input_data[0] << " " << input_data[10] << " "
<< input_data[input_size - 1] << std::endl;
std::cout << "label:" << label << std::endl;
}
// run
predictor->Run();
auto output0 = predictor->GetOutput(0);
auto output0_data = output0->data<float>();
// get output
std::iota(index.begin(), index.end(), 0);
std::stable_sort(
index.begin(), index.end(), [output0_data](size_t i1, size_t i2) {
return output0_data[i1] > output0_data[i2];
});
test_num++;
if (label == index[0]) {
top1_num++;
}
for (int k = 0; k < 5; k++) {
if (label == index[k]) {
top5_num++;
}
}
if (debug && i % show_step == 0) {
std::cout << index[0] << " " << index[1] << " " << index[2] << " "
<< index[3] << " " << index[4] << std::endl;
std::cout << output0_data[index[0]] << " " << output0_data[index[1]]
<< " " << output0_data[index[2]] << " "
<< output0_data[index[3]] << " " << output0_data[index[4]]
<< std::endl;
std::cout << output0_data[630] << std::endl;
}
if (i % show_step == 0) {
std::cout << "step " << i << "; top1 acc:" << top1_num / test_num
<< "; top5 acc:" << top5_num / test_num << std::endl;
}
}
std::cout << "final result:" << std::endl;
std::cout << "top1 acc:" << top1_num / test_num << std::endl;
std::cout << "top5 acc:" << top5_num / test_num << std::endl;
return 0;
}
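The harness above reads each sample file as input_size raw float32 pixels followed immediately by a 4-byte int label (see the fs.read calls in the loop). A sketch of producing one such file, with a hypothetical helper name that is not part of the commit:

#include <fstream>
#include <string>
#include <vector>

// Packs pixels plus a trailing int label in the layout main() reads back.
void WriteLabeledSample(const std::string& dir,
                        int index,
                        const std::vector<float>& pixels,
                        int label) {
  std::ofstream fs(dir + "/" + std::to_string(index), std::ofstream::binary);
  fs.write(reinterpret_cast<const char*>(pixels.data()),
           pixels.size() * sizeof(float));
  fs.write(reinterpret_cast<const char*>(&label), sizeof(label));
}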
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(MobileNetV1, test_mobilenetv1_int8_mediatek_apu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
lite_api::Place{TARGET(kAPU), PRECISION(kInt8)}});
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GE(out_accuracy, 0.55f);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
inline int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0]
<< " model_dir [thread_num] [warmup_times] [repeat_times] "
"[input_data_path] [output_data_path]"
<< std::endl;
return -1;
}
std::string model_dir = argv[1];
int thread_num = 1;
if (argc > 2) {
thread_num = atoi(argv[2]);
}
int warmup_times = 5;
if (argc > 3) {
warmup_times = atoi(argv[3]);
}
int repeat_times = 10;
if (argc > 4) {
repeat_times = atoi(argv[4]);
}
std::string input_data_path;
if (argc > 5) {
input_data_path = argv[5];
}
std::string output_data_path;
if (argc > 6) {
output_data_path = argv[6];
}
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
config.set_valid_places(
{paddle::lite_api::Place{
TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)},
paddle::lite_api::Place{
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto input_data = input_tensor->mutable_data<float>();
auto input_size = ShapeProduction(input_tensor->shape());
if (input_data_path.empty()) {
for (int i = 0; i < input_size; i++) {
input_data[i] = 1;
}
} else {
std::fstream fs(input_data_path, std::ios::in);
if (!fs.is_open()) {
std::cerr << "open input data file failed." << std::endl;
return -1;
}
for (int i = 0; i < input_size; i++) {
fs >> input_data[i];
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < repeat_times; ++i) {
predictor->Run();
}
std::cout << "Model: " << model_dir << ", threads num " << thread_num
<< ", warmup times: " << warmup_times
<< ", repeat times: " << repeat_times << ", spend "
<< (GetCurrentUS() - start) / repeat_times / 1000.0
<< " ms in average." << std::endl;
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
auto output_data = output_tensor->data<float>();
auto output_size = ShapeProduction(output_tensor->shape());
std::cout << "output data:";
for (int i = 0; i < output_size; i += 100) {
std::cout << "[" << i << "] " << output_data[i] << std::endl;
}
return 0;
}
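Unlike the MediaTek APU harness above, this demo parses its optional input file as whitespace-separated text floats (one value per `fs >> input_data[i]`). A matching writer, again purely illustrative:

#include <fstream>
#include <string>
#include <vector>

// One float per line; operator>> skips whitespace when reading back.
void WriteTextInput(const std::string& path, const std::vector<float>& values) {
  std::ofstream fs(path);
  for (float v : values) {
    fs << v << "\n";
  }
}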
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(MobileNetV1, test_mobilenetv1_int8_rockchip_npu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
lite_api::Place{TARGET(kRKNPU), PRECISION(kInt8)}});
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GE(out_accuracy, 0.52f);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(MobileNetV2, test_mobilenetv2_fp32_huawei_kirin_npu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}});
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GE(out_accuracy, 0.57f);
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(ResNet50, test_resnet50_fp32_huawei_kirin_npu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}});
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GE(out_accuracy, 0.64f);
}
} // namespace lite
} // namespace paddle
@@ -21,8 +21,14 @@
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
+#include "lite/tests/api/ILSVRC2012_utility.h"
 #include "lite/utils/cp_logging.h"
+DEFINE_string(data_dir, "", "data dir");
+DEFINE_int32(iteration, 100, "iteration times to run");
+DEFINE_int32(batch, 1, "batch of image");
+DEFINE_int32(channel, 3, "image channel");
 namespace paddle {
 namespace lite {
@@ -35,52 +41,62 @@ TEST(Resnet50, test_resnet50_fp32_xpu) {
   config.set_xpu_workspace_l3_size_per_thread();
   auto predictor = lite_api::CreatePaddlePredictor(config);
-  auto input_tensor = predictor->GetInput(0);
-  std::vector<int64_t> input_shape{1, 3, 224, 224};
-  input_tensor->Resize(input_shape);
-  auto* data = input_tensor->mutable_data<float>();
-  int input_num = 1;
-  for (size_t i = 0; i < input_shape.size(); ++i) {
-    input_num *= input_shape[i];
-  }
-  for (int i = 0; i < input_num; i++) {
-    data[i] = 1;
-  }
+  std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
+  std::vector<int> input_shape{
+      FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
+  auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
+  int input_size = 1;
+  for (auto i : input_shape) {
+    input_size *= i;
+  }
   for (int i = 0; i < FLAGS_warmup; ++i) {
+    auto input_tensor = predictor->GetInput(0);
+    input_tensor->Resize(
+        std::vector<int64_t>(input_shape.begin(), input_shape.end()));
+    auto* data = input_tensor->mutable_data<float>();
+    for (int j = 0; j < input_size; j++) {
+      data[j] = 0.f;
+    }
     predictor->Run();
   }
-  auto start = GetCurrentUS();
-  for (int i = 0; i < FLAGS_repeats; ++i) {
+  std::vector<std::vector<float>> out_rets;
+  out_rets.resize(FLAGS_iteration);
+  double cost_time = 0;
+  for (size_t i = 0; i < raw_data.size(); ++i) {
+    auto input_tensor = predictor->GetInput(0);
+    input_tensor->Resize(
+        std::vector<int64_t>(input_shape.begin(), input_shape.end()));
+    auto* data = input_tensor->mutable_data<float>();
+    memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
+    double start = GetCurrentUS();
     predictor->Run();
+    cost_time += GetCurrentUS() - start;
+    auto output_tensor = predictor->GetOutput(0);
+    auto output_shape = output_tensor->shape();
+    auto output_data = output_tensor->data<float>();
+    ASSERT_EQ(output_shape.size(), 2UL);
+    ASSERT_EQ(output_shape[0], 1);
+    ASSERT_EQ(output_shape[1], 1000);
+    int output_size = output_shape[0] * output_shape[1];
+    out_rets[i].resize(output_size);
+    memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
   }
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
-            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
-            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
-            << " ms in average.";
+            << ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
+            << ", iteration: " << FLAGS_iteration << ", spend "
+            << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
-  std::vector<std::vector<float>> results;
-  results.emplace_back(std::vector<float>(
-      {0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516,
-       0.00018169, 0.000289721, 0.000855934, 0.000732185, 9.2055e-05,
-       0.000220664, 0.00235289, 0.00571265, 0.00357688, 0.00129667,
-       0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033}));
-  auto out = predictor->GetOutput(0);
-  ASSERT_EQ(out->shape().size(), 2);
-  ASSERT_EQ(out->shape()[0], 1);
-  ASSERT_EQ(out->shape()[1], 1000);
-  int step = 50;
-  for (size_t i = 0; i < results.size(); ++i) {
-    for (size_t j = 0; j < results[i].size(); ++j) {
-      EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
-                  results[i][j],
-                  1e-5);
-    }
-  }
+  std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
+  float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
+  ASSERT_GT(out_accuracy, 0.6f);
 }
 } // namespace lite
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/tests/api/ILSVRC2012_utility.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 100, "iteration times to run");
DEFINE_int32(batch, 1, "batch of image");
DEFINE_int32(channel, 3, "image channel");
namespace paddle {
namespace lite {
TEST(VGG19, test_vgg19_fp32_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
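  // ReadRawData loads the raw float image data for FLAGS_iteration batches
  // from <data_dir>/raw_data.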
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
std::vector<int> input_shape{
FLAGS_batch, FLAGS_channel, FLAGS_im_width, FLAGS_im_height};
auto raw_data = ReadRawData(raw_data_dir, input_shape, FLAGS_iteration);
int input_size = 1;
for (auto i : input_shape) {
input_size *= i;
}
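  // Warm up with zero-filled inputs before timing.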
for (int i = 0; i < FLAGS_warmup; ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
for (int j = 0; j < input_size; j++) {
data[j] = 0.f;
}
predictor->Run();
}
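  // Timed loop: run every image once, accumulate latency, and record each
  // 1 x 1000 output for the accuracy check.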
std::vector<std::vector<float>> out_rets;
out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (size_t i = 0; i < raw_data.size(); ++i) {
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(
std::vector<int64_t>(input_shape.begin(), input_shape.end()));
auto* data = input_tensor->mutable_data<float>();
memcpy(data, raw_data[i].data(), sizeof(float) * input_size);
double start = GetCurrentUS();
predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1000);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", batch: " << FLAGS_batch
<< ", iteration: " << FLAGS_iteration << ", spend "
<< cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
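  // Compare outputs against labels.txt via CalOutAccuracy and enforce the
  // expected accuracy floor.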
std::string labels_dir = FLAGS_data_dir + std::string("/labels.txt");
float out_accuracy = CalOutAccuracy(out_rets, labels_dir);
ASSERT_GT(out_accuracy, 0.56f);
}
} // namespace lite
} // namespace paddle
This diff has been collapsed.
@@ -121,8 +121,8 @@ class FcOPTest : public arena::TestCase {
     int k = wdims_[0];
     int n = wdims_[1];
-    LOG(INFO) << "M=" << m << ", N=" << n << ", K=" << k
-              << ", bias=" << flag_bias << ", with_relu=" << with_relu_
-              << ", padding_weights=" << padding_weights_;
+    VLOG(4) << "M=" << m << ", N=" << n << ", K=" << k << ", bias=" << flag_bias
+            << ", with_relu=" << with_relu_
+            << ", padding_weights=" << padding_weights_;
     if (m == 1) {
......
@@ -738,7 +738,7 @@ TEST(PriorBox, precision) {
 }
 
 TEST(DensityPriorBox, precision) {
-#ifdef LITE_WITH_X86
+#if defined(LITE_WITH_X86) && !defined(LITE_WITH_XPU)
   Place place(TARGET(kX86));
 #endif
 #ifdef LITE_WITH_ARM
......
@@ -104,7 +104,7 @@ bool test_gemm_int8(bool tra,
     scale_merge_int8[j] = scale_merge_fp32[j] / scale_c[0];
   }
 
-  LOG(INFO) << "gemm_int8 M: " << m << ", N: " << n << ", K: " << k
-            << ", transA: " << (tra ? "true" : "false")
-            << ", transB: " << (trb ? "true" : "false")
-            << ", relu: " << (has_relu ? "true" : "false")
+  VLOG(4) << "gemm_int8 M: " << m << ", N: " << n << ", K: " << k
+          << ", transA: " << (tra ? "true" : "false")
+          << ", transB: " << (trb ? "true" : "false")
+          << ", relu: " << (has_relu ? "true" : "false")
@@ -344,8 +344,7 @@ TEST(TestLiteGemmInt8, gemm_prepacked_int8) {
                            FLAGS_power_mode,
                            th);
             if (flag) {
-              LOG(INFO) << "test m = " << m << ", n=" << n
-                        << ", k=" << k
-                        << ", bias: " << (has_bias ? "true" : "false")
-                        << ", relu: " << (has_relu ? "true" : "false")
-                        << ", trans A: " << (tra ? "true" : "false")
+              VLOG(4) << "test m = " << m << ", n=" << n << ", k=" << k
+                      << ", bias: " << (has_bias ? "true" : "false")
+                      << ", relu: " << (has_relu ? "true" : "false")
+                      << ", trans A: " << (tra ? "true" : "false")
......
@@ -97,7 +97,7 @@ bool test_gemv_int8(bool tra,
     scale_merge_int8[j] = scale_merge_fp32[j] / scale_c[0];
   }
 
-  LOG(INFO) << "gemv_int8 M: " << m << ", N: " << n
-            << ", transA: " << (tra ? "true" : "false") << ", act: " << flag_act
-            << ", bias: " << (has_bias ? "true" : "false");
+  VLOG(4) << "gemv_int8 M: " << m << ", N: " << n
+          << ", transA: " << (tra ? "true" : "false") << ", act: " << flag_act
+          << ", bias: " << (has_bias ? "true" : "false");
 #ifdef LITE_WITH_ARM
@@ -336,7 +336,7 @@ TEST(TestLiteGemvInt8, gemv_prepacked_int8) {
                          six,
                          alpha);
           if (flag) {
-            LOG(INFO) << "test m = " << m << ", n=" << n
-                      << ", bias: " << (has_bias ? "true" : "false")
-                      << ", relu: " << (has_relu ? "true" : "false")
-                      << ", trans A: " << (tra ? "true" : "false")
+            VLOG(4) << "test m = " << m << ", n=" << n
+                    << ", bias: " << (has_bias ? "true" : "false")
+                    << ", relu: " << (has_relu ? "true" : "false")
+                    << ", trans A: " << (tra ? "true" : "false")
......
@@ -98,7 +98,7 @@ bool test_sgemm_c4(
   basic_trans_mat_to_c4(da, da_c4, k, m, k, true);
   basic_trans_mat_to_c4(db, db_c4, n, k, n, false);
 
-  LOG(INFO) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k
-            << ", relu: " << (has_relu ? "true" : "false")
-            << ", bias: " << (has_bias ? "true" : "false");
+  VLOG(4) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k
+          << ", relu: " << (has_relu ? "true" : "false")
+          << ", bias: " << (has_bias ? "true" : "false");
@@ -331,7 +331,7 @@ TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) {
           auto flag = test_sgemm_c4(
               m, n, k, has_bias, has_relu, FLAGS_power_mode, th);
           if (flag) {
-            LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k
-                      << ", bias: " << (has_bias ? "true" : "false")
-                      << ", relu: " << (has_relu ? "true" : "false")
-                      << " passed\n";
+            VLOG(4) << "test m = " << m << ", n=" << n << ", k=" << k
+                    << ", bias: " << (has_bias ? "true" : "false")
+                    << ", relu: " << (has_relu ? "true" : "false")
+                    << " passed\n";
@@ -364,7 +364,7 @@ TEST(TestSgemmC8, test_func_sgemm_c8_prepacked) {
          auto flag = test_sgemm_c8(
              m, n, k, has_bias, has_relu, FLAGS_power_mode, th);
          if (flag) {
-           LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k
-                     << ", bias: " << (has_bias ? "true" : "false")
-                     << ", relu: " << (has_relu ? "true" : "false")
-                     << " passed\n";
+           VLOG(4) << "test m = " << m << ", n=" << n << ", k=" << k
+                   << ", bias: " << (has_bias ? "true" : "false")
+                   << ", relu: " << (has_relu ? "true" : "false")
+                   << " passed\n";
......
@@ -75,7 +75,7 @@ bool test_sgemv(bool tra,
   // fill_tensor_const(tb, 1.f);
   fill_tensor_rand(tbias, -1.f, 1.f);
 
-  LOG(INFO) << "sgemv M: " << m << ", K: " << k
-            << ", transA: " << (tra ? "true" : "false") << ", act: " << flag_act
-            << ", bias: " << (has_bias ? "true" : "false");
+  VLOG(4) << "sgemv M: " << m << ", K: " << k
+          << ", transA: " << (tra ? "true" : "false") << ", act: " << flag_act
+          << ", bias: " << (has_bias ? "true" : "false");
 #ifdef LITE_WITH_ARM
@@ -209,7 +209,7 @@ TEST(TestLiteSgemv, Sgemv) {
                     six,
                     alpha);
         if (flag) {
-          LOG(INFO) << "test m = " << m << ", k=" << k
-                    << ", bias: " << (has_bias ? "true" : "false")
-                    << ", flag act: " << flag_act
-                    << ", trans A: " << (tra ? "true" : "false")
+          VLOG(4) << "test m = " << m << ", k=" << k
+                  << ", bias: " << (has_bias ? "true" : "false")
+                  << ", flag act: " << flag_act
+                  << ", trans A: " << (tra ? "true" : "false")
......
This diff has been collapsed.