Unverified commit c0fa7ada, authored by HappyAngel, committed by GitHub

[ARM] fix calibration error, values from [-128,127] to [-127,127] (#2927)

* fix calibration error, values from [-128, 127] to [-127, 127], test=develop
* add classify demo and detection demo, test=develop
Parent b94839b0
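In effect, the quantization paths now clamp to the symmetric int8 range [-127, 127] instead of the full [-128, 127] range. A minimal scalar sketch of that clamp, for orientation only (the helper name and signature below are illustrative, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Round to nearest, then clamp the lower bound to -127 so the quantized
// range stays symmetric: [-127, 127] rather than [-128, 127].
inline int8_t quantize_symmetric_int8(float value, float inv_scale) {
  float scaled = std::round(value * inv_scale);
  scaled = std::min(127.f, std::max(-127.f, scaled));
  return static_cast<int8_t>(scaled);
}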
......@@ -330,6 +330,30 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -341,6 +365,7 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");
DEFINE_string(out_txt, "", "output text");
DEFINE_string(label_file, "", "label file path");
DEFINE_int32(topk, 1, "topk num");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
std::vector<std::string> load_labels(std::string label_path) {
FILE* fp = fopen(label_path.c_str(), "r");
if (fp == nullptr) {
LOG(FATAL) << "load label file failed! " << label_path;
}
std::vector<std::string> labels;
char str[1024];
// read the label file line by line; stop when fgets hits EOF
while (fgets(str, sizeof(str), fp) != nullptr) {
std::string str_s(str);
if (str_s.length() > 0) {
for (int i = 0; i < str_s.length(); i++) {
if (str_s[i] == ' ') {
std::string strr = str_s.substr(i, str_s.length() - i - 1);
labels.push_back(strr);
i = str_s.length();
}
}
}
}
fclose(fp);
return labels;
}
void print_topk(const float* scores,
const int size,
const int topk,
const std::vector<std::string> labels) {
std::vector<std::pair<float, int>> vec;
vec.resize(size);
for (int i = 0; i < size; i++) {
vec[i] = std::make_pair(scores[i], i);
}
std::partial_sort(vec.begin(),
vec.begin() + topk,
vec.end(),
std::greater<std::pair<float, int>>());
// print topk and score
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
fprintf(fp, "%d \n", topk);
for (int i = 0; i < topk; i++) {
float score = vec[i].first;
int index = vec[i].second;
fprintf(fp, "%d ", index);
fprintf(fp, "%f \n", score);
LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score;
}
fclose(fp);
}
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
// classify
printf("load_labels \n");
std::vector<std::string> labels = load_labels(FLAGS_label_file);
printf("print_topk \n");
print_topk(out, output_num, FLAGS_topk, labels);
LOG(INFO) << "output_num: " << output_num;
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
paddle::lite_api::Run(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
FLAGS_threads,
FLAGS_repeats,
FLAGS_warmup);
#endif
return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");
DEFINE_string(out_txt, "", "output text");
DEFINE_int32(orih, 1920, "input image height");
DEFINE_int32(oriw, 1080, "input image width");
namespace paddle {
namespace lite_api {
struct Object {
float x;
float y;
float width;
float height;
float class_id;
float prob;
};
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
void detect_choose(const float* dout,
std::vector<int64_t> dims,
const float thresh) {
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int iw = 0; iw < dims[0]; iw++) {
const float* values = dout + iw * dims[1];
if (values[1] > thresh) { // prob > thresh
fprintf(fp, "%f \n", values[0]);
fprintf(fp, "%f \n", values[1]);
fprintf(fp, "%f \n", values[2]);
fprintf(fp, "%f \n", values[3]);
fprintf(fp, "%f \n", values[4]);
fprintf(fp, "%f \n", values[5]);
}
}
fclose(fp);
}
void detect_object(const float* dout,
std::vector<int64_t> dims,
const float thresh,
int orih,
int oriw) {
std::vector<Object> objects;
for (int iw = 0; iw < dims[0]; iw++) {
Object object;
const float* values = dout + iw * dims[1];
object.class_id = values[0];
object.prob = values[1];
object.x = values[2] * oriw;
object.y = values[3] * orih;
object.width = values[4] * oriw - object.x;
object.height = values[5] * orih - object.y;
objects.push_back(object);
}
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) {
Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) {
if (object.x >= oriw || object.width >= oriw || object.y >= orih ||
object.height >= orih)
continue;
fprintf(fp, "%f \n", object.x);
fprintf(fp, "%f \n", object.y);
fprintf(fp, "%f \n", object.width);
fprintf(fp, "%f \n", object.height);
fprintf(fp, "%f \n", object.prob);
fprintf(fp, "%f \n", object.class_id);
LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw
<< ", " << orih << ", detect object: " << object.prob
<< ", location: x=" << object.x << ", y=" << object.y
<< ", width=" << object.width << ", height=" << object.height;
}
}
fclose(fp);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
// detect
detect_object(
out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw);
// detect_choose(out, output_shape, atof(FLAGS_threshold.data()));
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
LOG(INFO) << "output_num: " << output_num;
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
paddle::lite_api::Run(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
FLAGS_threads,
FLAGS_repeats,
FLAGS_warmup);
#endif
return 0;
}
......@@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
#define GEMM_INT8_INT8_OUT \
GEMM_TRANS_INT32_TO_FP32 \
GEMM_INT8_RELU \
"ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v8.4s\n" \
"fcmge v1.4s, v17.4s, v8.4s\n" \
"fcmge v2.4s, v18.4s, v8.4s\n" \
"fcmge v3.4s, v19.4s, v8.4s\n" \
"fcmge v4.4s, v20.4s, v8.4s\n" \
"fcmge v5.4s, v21.4s, v8.4s\n" \
"fcmge v6.4s, v22.4s, v8.4s\n" \
"fcmge v7.4s, v23.4s, v8.4s\n" \
/* choose data */ \
"bif v16.16b, v8.16b, v0.16b \n" \
"bif v17.16b, v8.16b, v1.16b \n" \
"bif v18.16b, v8.16b, v2.16b \n" \
"bif v19.16b, v8.16b, v3.16b \n" \
"bif v20.16b, v8.16b, v4.16b \n" \
"bif v21.16b, v8.16b, v5.16b \n" \
"bif v22.16b, v8.16b, v6.16b \n" \
"bif v23.16b, v8.16b, v7.16b \n" \
"fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \
......@@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \
"fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \
"fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \
/* data >= -127 */ \
"fcmge v16.4s, v24.4s, v8.4s\n" \
"fcmge v17.4s, v25.4s, v8.4s\n" \
"fcmge v18.4s, v26.4s, v8.4s\n" \
"fcmge v19.4s, v27.4s, v8.4s\n" \
"fcmge v20.4s, v28.4s, v8.4s\n" \
"fcmge v21.4s, v29.4s, v8.4s\n" \
"fcmge v22.4s, v30.4s, v8.4s\n" \
"fcmge v23.4s, v31.4s, v8.4s\n" \
/* choose data */ \
"bif v24.16b, v8.16b, v16.16b\n" \
"bif v25.16b, v8.16b, v17.16b\n" \
"bif v26.16b, v8.16b, v18.16b\n" \
"bif v27.16b, v8.16b, v19.16b\n" \
"bif v28.16b, v8.16b, v20.16b\n" \
"bif v29.16b, v8.16b, v21.16b\n" \
"bif v30.16b, v8.16b, v22.16b\n" \
"bif v31.16b, v8.16b, v23.16b\n" \
"sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \
"fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \
"sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \
......@@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"v9","v10","v11","v12","v13","v14",
"v15","v16","v17","v18","v19","v20",
"v21","v22","v23","v24","v25","v26",
"v27","v28","v29","v30","v31","cc");
"v27","v28","v29","v30","v31","cc", "memory");
// clang-format on
}
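Each kernel applies the same clamp before fcvtas rounds the fp32 accumulators to int32: fcmge builds a per-lane mask for data >= -127, and bif replaces the lanes where the mask is false with -127. A rough NEON-intrinsics equivalent of that pattern, for reference only (not part of the patch):

#include <arm_neon.h>

// Clamp every fp32 lane to the symmetric int8 lower bound (-127.f),
// mirroring the fcmge + bif sequence used in the assembly above.
static inline float32x4_t clamp_lanes_to_neg127(float32x4_t v) {
  const float32x4_t vmin = vdupq_n_f32(-127.f);
  uint32x4_t ge = vcgeq_f32(v, vmin);  // mask: lane >= -127.f
  return vbslq_f32(ge, v, vmin);       // keep lane where mask set, else -127.f
}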
......@@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
int k,
int rem) {
// clang-format off
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu),
[bias] "r"(bias),
[rem] "r"(rem),
[scale] "r"(scale)
[scale] "r"(scale),
[vmax] "r"(vmax)
: "v0","v1","v2","v3","v4","v5","v6","v7",
"v8","v9","v10","v11","v12",
"v13","v14","v15","v16","v17",
"v18","v19","v20","v21","v22",
"v23","v24","v25","v26","v27",
"v28","v29","v30","v31","cc");
"v28","v29","v30","v31","cc", "memory");
// clang-format on
}
......@@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
#define GEMM_SDOT_INT8_OUT \
GEMM_SDOT_CVT_INT32_TO_FP32 \
GEMM_SDOT_RELU \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
/* data >= -127 */ \
"fcmge v0.4s, v8.4s, v6.4s\n" \
"fcmge v1.4s, v9.4s, v6.4s\n" \
"fcmge v2.4s, v10.4s, v6.4s\n" \
"fcmge v3.4s, v11.4s, v6.4s\n" \
"fcmge v4.4s, v12.4s, v6.4s\n" \
"fcmge v5.4s, v13.4s, v6.4s\n" \
"fcmge v7.4s, v14.4s, v6.4s\n" \
/* choose data */ \
"bif v8.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v15.4s, v6.4s\n" \
"bif v9.16b, v6.16b, v1.16b\n" \
"bif v10.16b, v6.16b, v2.16b\n" \
"bif v11.16b, v6.16b, v3.16b\n" \
"bif v12.16b, v6.16b, v4.16b\n" \
"bif v13.16b, v6.16b, v5.16b\n" \
"bif v14.16b, v6.16b, v7.16b\n" \
"bif v15.16b, v6.16b, v0.16b \n" \
"fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \
......@@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \
"sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \
"sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v6.4s\n" \
"fcmge v1.4s, v17.4s, v6.4s\n" \
"fcmge v2.4s, v18.4s, v6.4s\n" \
"fcmge v3.4s, v19.4s, v6.4s\n" \
"fcmge v4.4s, v20.4s, v6.4s\n" \
"fcmge v5.4s, v21.4s, v6.4s\n" \
"fcmge v7.4s, v22.4s, v6.4s\n" \
"fcmge v8.4s, v23.4s, v6.4s\n" \
"fcmge v9.4s, v24.4s, v6.4s\n" \
/* choose data */ \
"bif v16.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v25.4s, v6.4s\n" \
"bif v17.16b, v6.16b, v1.16b\n" \
"bif v18.16b, v6.16b, v2.16b\n" \
"bif v19.16b, v6.16b, v3.16b\n" \
"bif v20.16b, v6.16b, v4.16b\n" \
"bif v21.16b, v6.16b, v5.16b\n" \
"bif v22.16b, v6.16b, v7.16b\n" \
"bif v23.16b, v6.16b, v8.16b\n" \
"bif v24.16b, v6.16b, v9.16b\n" \
"bif v25.16b, v6.16b, v0.16b\n" \
"fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \
......@@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \
"sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \
"sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v26.4s, v6.4s\n" \
"fcmge v1.4s, v27.4s, v6.4s\n" \
"fcmge v2.4s, v28.4s, v6.4s\n" \
"fcmge v3.4s, v29.4s, v6.4s\n" \
"fcmge v4.4s, v30.4s, v6.4s\n" \
"fcmge v5.4s, v31.4s, v6.4s\n" \
/* choose data */ \
"bif v26.16b, v6.16b, v0.16b\n" \
"bif v27.16b, v6.16b, v1.16b\n" \
"bif v28.16b, v6.16b, v2.16b\n" \
"bif v29.16b, v6.16b, v3.16b\n" \
"bif v30.16b, v6.16b, v4.16b\n" \
"bif v31.16b, v6.16b, v5.16b\n" \
"fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \
"fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \
"fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \
......@@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
int k,
int tail) {
// clang-format off
float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
[c_ptr5] "+r"(c_ptr5),
[c_ptr6] "+r"(c_ptr6),
[c_ptr7] "+r"(c_ptr7)
: [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu)
: [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax)
: "cc","memory","v0","v1","v2","v3",
"v4","v5","v6","v7","v8","v9","v10",
"v11","v12","v13","v14","v15","v16","v17",
......@@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"vadd.f32 q3, q11, q3\n" /* r21, add offset */ \
"vadd.f32 q4, q12, q4\n" /* r30, add offset */ \
"vadd.f32 q5, q13, q5\n" /* r31, add offset */ \
"vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \
"vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \
/* choose data */ \
"vbif q8, q6, q7\n" /* @ choose */ \
"vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \
"vbif q9, q6, q10\n" /* @ choose */ \
"vbif q0, q6, q11\n" /* @ choose */ \
"vbif q1, q6, q12\n" /* @ choose */ \
"vbif q2, q6, q13\n" /* @ choose */ \
"vbif q3, q6, q14\n" /* @ choose */ \
"vbif q4, q6, q15\n" /* @ choose */ \
"vbif q5, q6, q7\n" /* @ choose */ \
"vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \
"vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \
"vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \
......@@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14",
"q15",
"r0",
"cc");
"cc",
"memory");
}
template <>
......@@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
bool is_relu,
int k,
int rem) {
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu),
[bias] "r"(bias),
[rem] "r"(rem),
[vmax] "r"(vmax),
[scale] "r"(scale)
: "q0",
"q1",
......@@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14",
"q15",
"r0",
"cc");
"cc",
"memory");
}
#endif // __aarch64__ // NOLINT
......
......@@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in,
for (int i = 0; i < size; ++i) {
out[0] =
saturate_cast<signed char>(roundf(*(in++) * *(scale++) + *(bias++)));
out[0] = out[0] < -127 ? -127 : out[0]; // clamp to [-127, 127]
if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0;
}
......@@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in,
} else {
for (int i = 0; i < size; ++i) {
out[0] = saturate_cast<signed char>(roundf(*(in++) * *(scale++)));
out[0] = out[0] < -127 ? -127 : out[0]; // clamp to [-127, 127]
if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0;
}
......
......@@ -40,13 +40,11 @@ void fp32_to_int8(const float* din,
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
......@@ -56,6 +54,7 @@ void fp32_to_int8(const float* din,
const float* din_ptr = din_c;
signed char* dout_ptr = dout_c;
#ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile(
"ldp q0, q1, [%[in]], #32 \n"
"ldp q2, q3, [%[in]], #32 \n"
......@@ -64,16 +63,19 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n"
/* choose data */
"bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
/* fp32 - int32 */
"FCVTAS v8.4s, v4.4s \n"
"FCVTAS v9.4s, v5.4s \n"
"FCVTAS v10.4s, v6.4s \n"
......@@ -89,7 +91,9 @@ void fp32_to_int8(const float* din,
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -102,6 +106,7 @@ void fp32_to_int8(const float* din,
"v10",
"v11");
#else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
......@@ -113,23 +118,27 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n"
"vbif q4, %q[vmax], q8 @ choose \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n"
"vbif q5, %q[vmax], q9 @ choose \n"
"vbif q6, %q[vmax], q10 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n"
/* data >= -127 */
"vcge.f32 q8, q4, q0 @ q4 >= -127 \n"
"vcge.f32 q9, q5, q0 @ q4 >= -127 \n"
"vcge.f32 q10, q6, q0 @ q4 >= -127 \n"
"vcge.f32 q11, q7, q0 @ q4 >= -127 \n"
/* choose data */
"vbif q4, q0, q8 @ choose \n"
"vbif q5, q0, q9 @ choose \n"
"vbif q6, q0, q10 @ choose \n"
"vbif q7, q0, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
......@@ -150,9 +159,22 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale),
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero),
[vmax] "w"(vmax)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10");
[vmax] "r"(vmax),
[vzero] "w"(vzero)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
#endif
}
const float* din_r = din_c + 16 * cnt;
......@@ -203,7 +225,7 @@ void fp32_to_int16(const float* din,
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9");
: "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
......@@ -232,7 +254,7 @@ void fp32_to_int16(const float* din,
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
: "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
}
const float* din_r = din_c + 8 * cnt;
......@@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const signed char* din_r = din_c + 16 * cnt;
......@@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
: "cc",
"memory",
"v0",
"v1",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
......@@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const int16_t* din_r = din_c + 16 * cnt;
......@@ -473,7 +508,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -506,7 +543,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0",
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
......@@ -551,41 +590,53 @@ void int32_to_int8(const int* din,
const int* din_ptr = din_c;
int8_t* dout_ptr = dout_c;
#ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile(
"0: \n"
"ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
/* int32 - fp32 */
"scvtf v4.4s, v0.4s \n"
"scvtf v5.4s, v1.4s \n"
"scvtf v6.4s, v2.4s \n"
"scvtf v7.4s, v3.4s \n"
/* mul scale */
"fmul v0.4s, v4.4s, %[scale].4s \n"
"fmul v1.4s, v5.4s, %[scale].4s \n"
"fmul v2.4s, v6.4s, %[scale].4s \n"
"fmul v3.4s, v7.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v4.4s, v0.4s, %[vmax].4s \n"
"fcmge v5.4s, v1.4s, %[vmax].4s \n"
"fcmge v6.4s, v2.4s, %[vmax].4s \n"
"fcmge v7.4s, v3.4s, %[vmax].4s \n"
/* choose data */
"bif v0.16b, %[vmax].16b, v4.16b \n"
"bif v1.16b, %[vmax].16b, v5.16b \n"
"bif v2.16b, %[vmax].16b, v6.16b \n"
"bif v3.16b, %[vmax].16b, v7.16b \n"
/* fp32 - int32 */
"fcvtas v4.4s, v0.4s \n"
"fcvtas v5.4s, v1.4s \n"
"fcvtas v6.4s, v2.4s \n"
"fcvtas v7.4s, v3.4s \n"
/* int32 - int16 */
"sqxtn v0.4h, v4.4s \n"
"sqxtn2 v0.8h, v5.4s \n"
"sqxtn v1.4h, v6.4s \n"
"sqxtn2 v1.8h, v7.4s \n"
/* int16 - int8 */
"sqxtn v2.8b, v0.8h \n"
"sqxtn2 v2.16b, v1.8h \n"
/* store */
"st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
......@@ -607,9 +658,21 @@ void int32_to_int8(const int* din,
"vbif.f32 q2, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q3, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q0, q4, %q[vscale] @ mul scale\n"
"vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n"
"vmla.f32 q1, q5, %q[vscale] @ mul scale\n"
"vmla.f32 q2, q6, %q[vscale] @ mul scale\n"
"vmla.f32 q3, q7, %q[vscale] @ mul scale\n"
/* data >= -127 */
"vcge.f32 q8, q0, q4 @ q0 >= -127 \n"
"vcge.f32 q9, q1, q4 @ q1 >= -127 \n"
"vcge.f32 q10, q2, q4 @ q2 >= -127 \n"
"vcge.f32 q11, q3, q4 @ q3 >= -127 \n"
/* choose data */
"vbif q0, q4, q8 @ choose \n"
"vbif q1, q4, q9 @ choose \n"
"vbif q2, q4, q10 @ choose \n"
"vbif q3, q4, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q4, q0 @ cvt to int32\n"
"vcvt.s32.f32 q5, q1 @ cvt to int32\n"
"vcvt.s32.f32 q6, q2 @ cvt to int32\n"
......@@ -628,9 +691,12 @@ void int32_to_int8(const int* din,
: [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
: [vscale] "w"(vscale),
[vzero] "w"(vzero),
[vmax] "r"(vmax),
[vnoff] "w"(vnoff),
[vpoff] "w"(vpoff)
: "q0",
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
......@@ -648,6 +714,7 @@ void int32_to_int8(const int* din,
int8_t* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
}
}
}
......@@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) {
"bne 0b \n"
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n"
......@@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) {
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
float32x2_t vmax_p =
vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val));
......
......@@ -34,7 +34,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(batch, 1, "batch size");
......@@ -457,7 +457,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
const std::vector<int>& power_mode) {}
#endif // LITE_WITH_ARM
#if 0 /// 3x3dw
#if 1 /// 3x3dw
TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
......@@ -525,7 +525,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
}
#endif /// 5x5dw
#if 0 /// conv1x1s1
#if 1 /// conv1x1s1
TEST(TestConv1x1s1Int8, test_conv1x1s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) {
......@@ -562,7 +562,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
}
#endif /// conv1x1s1
#if 0 /// conv3x3s1
#if 1 /// conv3x3s1
TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 33}) {
......@@ -602,7 +602,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
}
#endif /// conv3x3s1
#if 0 /// conv3x3s2
#if 1 /// conv3x3s2
TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 31}) {
......
......@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(M, 512, "gemm: M");
......