Commit 6a75ac5d authored by Jiansong Wang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into jiansowa/img_nna

test=develop
...@@ -16,6 +16,11 @@ if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
  return()
endif()
# require -D_GLIBCXX_USE_CXX11_ABI=0 if GCC 7.3.0
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
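The flag above matters because GCC 5 and later default to libstdc++'s new C++11 ABI, while the prebuilt Huawei Ascend NPU DDK libraries appear to expect the old one; forcing _GLIBCXX_USE_CXX11_ABI=0 keeps std::string and std::list symbol names compatible at link time. A minimal way to confirm which ABI a translation unit was built with, assuming GCC/libstdc++ (a sketch, not part of this patch):

// abi_check.cc - compile with and without -D_GLIBCXX_USE_CXX11_ABI=0
#include <cstdio>
#include <string>  // pulls in <bits/c++config.h>, which defines the macro

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI
  std::printf("new (C++11) libstdc++ ABI\n");
#else
  std::printf("old (pre-C++11) libstdc++ ABI\n");
#endif
  return 0;
}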
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
    set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
......
...@@ -217,6 +217,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
        SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
    ENDIF()
IF(LITE_WITH_HUAWEI_ASCEND_NPU)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
ENDIF()
    if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
        ExternalProject_Add(
            ${TARGET_NAME}
......
...@@ -64,6 +64,7 @@ if (WITH_TESTING)
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
    # data
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
  endif()
endif()
......
...@@ -356,5 +356,13 @@ void MobileConfig::set_model_buffer(const char *model_buffer,
  model_from_memory_ = true;
}
// This is the method for allocating workspace_size according to L3Cache size
void MobileConfig::SetArmL3CacheSize(L3CacheSetMethod method,
int absolute_val) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Global().SetArmL3CacheSize(method, absolute_val);
#endif
}
}  // namespace lite_api
}  // namespace paddle
...@@ -32,6 +32,14 @@ using shape_t = std::vector<int64_t>;
using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// Methods for allocating L3Cache on Arm platform
enum class L3CacheSetMethod {
kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance.
kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance
// with less memory consumption.
kAbsolute = 2, // Use the external setting.
// kAutoGrow = 3, // Not supported yet, least memory consumption.
};
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
...@@ -294,6 +302,11 @@ class LITE_API MobileConfig : public ConfigBase {
  // NOTE: This is a deprecated API and will be removed in latter release.
  const std::string& param_buffer() const { return param_buffer_; }
// This is the method for allocating workspace_size according to L3Cache size
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1);
};
template <typename ConfigT>
......
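Taken together, the additions above give MobileConfig three ways to size the Arm workspace. A minimal usage sketch on an Arm build (names other than the new API are placeholders):

#include "lite/api/paddle_api.h"

void ConfigureL3(paddle::lite_api::MobileConfig* config) {
  using paddle::lite_api::L3CacheSetMethod;
  // Default behaviour: size the workspace from the device's L3 cache,
  // falling back to L2 when no L3 size is reported.
  config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL3Cache);
  // Trade speed for memory: use the (smaller) L2 cache size instead.
  // config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  // Fully manual: reserve exactly 1 MB, as the unit test below does.
  // config->SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
}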
...@@ -107,7 +107,8 @@ TEST(CxxApi, share_external_data) {
TEST(LightApi, run) {
  lite_api::MobileConfig config;
  config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
// disable L3 cache on workspace_ allocating
config.SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  auto predictor = lite_api::CreatePaddlePredictor(config);
  auto inputs = predictor->GetInputNames();
...@@ -148,6 +149,8 @@ TEST(MobileConfig, LoadfromMemory) {
  // set model buffer and run model
  lite_api::MobileConfig config;
  config.set_model_from_buffer(model_buffer);
// allocate 1M initial space for workspace_
config.SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
  auto predictor = lite_api::CreatePaddlePredictor(config);
  auto input_tensor = predictor->GetInput(0);
......
...@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
  PADDLE_DLSYM(NeuronModel_setOperandValue);
  PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
  PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_addOperationExtension);
  PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
  PADDLE_DLSYM(NeuronCompilation_create);
  PADDLE_DLSYM(NeuronCompilation_free);
  PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronCompilation_createForDevices);
  PADDLE_DLSYM(NeuronExecution_create);
  PADDLE_DLSYM(NeuronExecution_free);
  PADDLE_DLSYM(NeuronExecution_setInput);
  PADDLE_DLSYM(NeuronExecution_setOutput);
  PADDLE_DLSYM(NeuronExecution_compute);
PADDLE_DLSYM(Neuron_getDeviceCount);
PADDLE_DLSYM(Neuron_getDevice);
PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
...@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
      model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_addOperationExtension(NeuronModel* model,
const char* name,
const char* vendor,
const NeuronDevice* device,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_addOperationExtension()(model,
name,
vendor,
device,
inputCount,
inputs,
outputCount,
outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
                                         uint32_t inputCount,
                                         const uint32_t* inputs,
...@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
      compilation);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_createForDevices()(
model, devices, numDevices, compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
                           NeuronExecution** execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
...@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
  return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
      execution);
}
int Neuron_getDeviceCount(uint32_t* numDevices) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
numDevices);
}
int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
device);
}
int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
name);
}
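The new wrapper entry points are what a subgraph runtime needs in order to target a specific accelerator instead of the default device. A hypothetical call sequence, sketched under the assumption that the symbols resolved above are available at runtime (the device-name filter is illustrative only):

#include <cstring>

NeuronDevice* FindDeviceByName(const char* wanted) {
  uint32_t count = 0;
  if (Neuron_getDeviceCount(&count) != NEURON_NO_ERROR) return nullptr;
  for (uint32_t i = 0; i < count; ++i) {
    NeuronDevice* device = nullptr;
    const char* name = nullptr;
    if (Neuron_getDevice(i, &device) == NEURON_NO_ERROR &&
        NeuronDevice_getName(device, &name) == NEURON_NO_ERROR &&
        name != nullptr && std::strcmp(name, wanted) == 0) {
      return device;
    }
  }
  return nullptr;
}

int CompileForSingleDevice(NeuronModel* model,
                           const NeuronDevice* device,
                           NeuronCompilation** compilation) {
  const NeuronDevice* devices[1] = {device};
  return NeuronCompilation_createForDevices(model, devices, 1, compilation);
}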
...@@ -42,12 +42,25 @@ class NeuronAdapter final {
                                                  const uint32_t *,
                                                  uint32_t,
                                                  const uint32_t *);
using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
const char *,
const char *,
const NeuronDevice *,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
  using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
      NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
  using NeuronCompilation_create_Type = int (*)(NeuronModel *,
                                                NeuronCompilation **);
  using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
  using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
  using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
                                              NeuronExecution **);
  using NeuronExecution_free_Type = void (*)(NeuronExecution *);
...@@ -59,6 +72,10 @@ class NeuronAdapter final {
  using NeuronExecution_setOutput_Type = int (*)(
      NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
  using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
const char **);
  Neuron_getVersion_Type Neuron_getVersion() {
    CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
...@@ -105,6 +122,12 @@ class NeuronAdapter final {
    return NeuronModel_addOperation_;
  }
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
return NeuronModel_addOperationExtension_;
}
  NeuronModel_identifyInputsAndOutputs_Type
  NeuronModel_identifyInputsAndOutputs() {
    CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
...@@ -130,6 +153,12 @@ class NeuronAdapter final {
    return NeuronCompilation_finish_;
  }
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
return NeuronCompilation_createForDevices_;
}
  NeuronExecution_create_Type NeuronExecution_create() {
    CHECK(NeuronExecution_create_ != nullptr)
        << "Cannot load NeuronExecution_create!";
...@@ -160,6 +189,23 @@ class NeuronAdapter final {
    return NeuronExecution_compute_;
  }
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
return NeuronDevice_getName_;
}
 private:
  NeuronAdapter();
  NeuronAdapter(const NeuronAdapter &) = delete;
...@@ -176,16 +222,23 @@ class NeuronAdapter final {
  NeuronModel_setOperandSymmPerChannelQuantParams_Type
      NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
  NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
nullptr};
  NeuronModel_identifyInputsAndOutputs_Type
      NeuronModel_identifyInputsAndOutputs_{nullptr};
  NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
  NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
  NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
  NeuronExecution_create_Type NeuronExecution_create_{nullptr};
  NeuronExecution_free_Type NeuronExecution_free_{nullptr};
  NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
  NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
  NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
Neuron_getDevice_Type Neuron_getDevice_{nullptr};
NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
}  // namespace lite
}  // namespace paddle
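For context, every function added to this header follows the same dlopen/dlsym pattern: a *_Type alias, a *_ pointer member defaulted to nullptr, a CHECKed accessor, and one PADDLE_DLSYM line in InitFunctions(). A rough sketch of what that macro presumably expands to (the real definition lives in the adapter's .cc file; handle_ and the exact body here are assumptions):

#include <dlfcn.h>

// Illustrative only: resolve a symbol from the already-dlopen()ed Neuron
// runtime library and store it in the correspondingly named member.
#define PADDLE_DLSYM(func) \
  func##_ = reinterpret_cast<func##_Type>(dlsym(handle_, #func))

// e.g. PADDLE_DLSYM(NeuronCompilation_createForDevices); fills
// NeuronCompilation_createForDevices_, which the accessor above CHECKs
// before handing it to callers.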
...@@ -161,7 +161,7 @@ class ContextProjectFunctor {
                          sequence_width});
        if (up_pad > 0) {  // add up pad
          int padding_rows = (std::min)(
              up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
          for (int k = 0; k < padding_rows; ++k) {
...@@ -180,10 +180,10 @@ class ContextProjectFunctor {
        }
        if (down_pad > 0) {  // add down pad
          int down_pad_begin_row =
              (std::max)(
                  0, (sequence_height - context_start - context_length) + 1) +
              1;
          int padding_begin = (std::max)(0, context_start - sequence_height);
          int padding_size =
              sequence_height - context_start >= context_length
                  ? 1
......
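The only change in the functors above (and in the pooling, sequence, parallel and pass code that follows) is wrapping std::min/std::max in parentheses. The point is portability: when a header such as windows.h defines function-like min/max macros and NOMINMAX is not set, std::min(a, b) gets macro-expanded and breaks, while (std::min)(a, b) cannot be treated as a macro invocation. A self-contained illustration (the local macro stands in for the Windows one):

#include <algorithm>

#define min(a, b) (((a) < (b)) ? (a) : (b))  // stand-in for the Windows macro

int Clamp(int v, int hi) {
  // return std::min(v, hi);  // would macro-expand into "std::(((v) < ..." and fail
  return (std::min)(v, hi);   // the parentheses suppress macro expansion
}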
...@@ -67,8 +67,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -76,8 +76,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            T ele = pool_process.initial();
...@@ -150,8 +150,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -159,8 +159,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
...@@ -228,12 +228,12 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = (std::min)(hstart + ksize_height, input_height);
          hstart = (std::max)(hstart, 0);
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = (std::min)(wstart + ksize_width, input_width);
            wstart = (std::max)(wstart, 0);
            bool stop = false;
            for (int h = hstart; h < hend && !stop; ++h) {
...@@ -337,8 +337,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -346,8 +346,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -355,8 +355,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
              T ele = pool_process.initial();
...@@ -441,8 +441,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -450,8 +450,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -459,8 +459,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int pool_size =
...@@ -540,16 +540,16 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = (std::min)(dstart + ksize_depth, input_depth);
          dstart = (std::max)(dstart, 0);
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
              bool stop = false;
              for (int d = dstart; d < dend && !stop; ++d) {
                for (int h = hstart; h < hend && !stop; ++h) {
...@@ -651,8 +651,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = (std::min)(hstart + ksize_height, input_height);
            hstart = (std::max)(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            if (adaptive) {
...@@ -660,8 +660,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = (std::min)(wstart + ksize_width, input_width);
              wstart = (std::max)(wstart, 0);
            }
            T1 ele = static_cast<T1>(-FLT_MAX);
...@@ -794,8 +794,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = (std::min)(dstart + ksize_depth, input_depth);
            dstart = (std::max)(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            if (adaptive) {
...@@ -803,8 +803,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = (std::min)(hstart + ksize_height, input_height);
              hstart = (std::max)(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              if (adaptive) {
...@@ -812,8 +812,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = (std::min)(wstart + ksize_width, input_width);
                wstart = (std::max)(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
......
...@@ -35,7 +35,7 @@ inline static uint64_t MaximumSequenceLength(
  uint64_t seq_num = seq_offset.size() - 1;
  uint64_t max_seq_len = 0;
  for (size_t i = 0; i < seq_num; ++i) {
    max_seq_len = (std::max)(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return max_seq_len;
}
......
...@@ -26,7 +26,7 @@ namespace x86 {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_WITH_MKLML
  int real_num_threads = (std::max)(num_threads, 1);
  x86::MKL_Set_Num_Threads(real_num_threads);
  omp_set_num_threads(real_num_threads);
#endif
...@@ -52,14 +52,14 @@ static inline void RunParallelFor(const int64_t begin,
  }
#ifdef PADDLE_WITH_MKLML
  int64_t num_threads = (std::min)(GetMaxThreads(), end - begin);
  if (num_threads > 1) {
#pragma omp parallel num_threads(num_threads)
    {
      int64_t tid = omp_get_thread_num();
      int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
      int64_t begin_tid = begin + tid * chunk_size;
      f(begin_tid, (std::min)(end, chunk_size + begin_tid));
    }
    return;
  }
......
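As a quick sanity check of the chunking in RunParallelFor above: begin = 0, end = 10, num_threads = 4 gives chunk_size = 3 and per-thread ranges [0,3), [3,6), [6,9), [9,10); the (std::min) cap is what trims the last thread's range to end. The same arithmetic, factored into a standalone helper (a sketch, not part of the patch):

#include <algorithm>
#include <cstdint>

void ChunkBounds(int64_t begin, int64_t end, int64_t num_threads,
                 int64_t tid, int64_t* lo, int64_t* hi) {
  int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
  *lo = begin + tid * chunk_size;           // this thread's start
  *hi = (std::min)(end, chunk_size + *lo);  // never run past end
}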
...@@ -17,6 +17,7 @@
#include <cstdarg>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
...@@ -27,6 +28,7 @@
namespace paddle {
namespace lite {
using L3CacheSetMethod = lite_api::L3CacheSetMethod;
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
...@@ -65,11 +67,41 @@ class DeviceInfo {
  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
// Methods for allocating L3Cache on Arm platform
// Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1) {
l3_cache_method_ = method;
absolute_l3cache_size_ = absolute_val;
// Realloc memory for sgemm in this context.
workspace_.clear();
workspace_.Resize({llc_size()});
workspace_.mutable_data<int8_t>();
}
  int llc_size() const {
    auto size = absolute_l3cache_size_;
    switch (l3_cache_method_) {
// kDeviceL3Cache = 0, use the system L3 Cache size, best performance.
case L3CacheSetMethod::kDeviceL3Cache:
size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
: L2_cache_[active_ids_[0]];
break;
// kDeviceL2Cache = 1, use the system L2 Cache size, trade off performance
// with less memory consumption.
case L3CacheSetMethod::kDeviceL2Cache:
size = L2_cache_[active_ids_[0]];
break;
// kAbsolute = 2, use the external setting.
case L3CacheSetMethod::kAbsolute:
break;
default:
LOG(FATAL) << "Error: unknown l3_cache_method_ !";
}
    return size > 0 ? size : 512 * 1024;
  }
  bool has_dot() const { return dot_[active_ids_[0]]; }
  bool has_fp16() const { return fp16_[active_ids_[0]]; }
...@@ -121,6 +153,10 @@ class DeviceInfo {
  void RequestPowerRandHighMode(int shift_num, int thread_num);
  void RequestPowerRandLowMode(int shift_num, int thread_num);
// Methods for allocating L3Cache on Arm platform
// Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
L3CacheSetMethod l3_cache_method_{L3CacheSetMethod::kDeviceL3Cache};
int absolute_l3cache_size_{-1};
  DeviceInfo() = default;
};
#endif  // LITE_WITH_ARM
......
...@@ -148,7 +148,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
        int cur_life =
            (*lifecycles)[TargetToStr(target_type)][var_name].second;
        (*lifecycles)[TargetToStr(target_type)][var_name].second =
            (std::max)(max_lifecycle_, cur_life);
      }
    }
    ++max_lifecycle_;
......
...@@ -61,7 +61,7 @@ class StaticKernelPickPass : public mir::StmtPass {
    float final_score{-1.};
    Place winner_place{places[0]};
    const int kMax =
        (std::numeric_limits<core::KernelPickFactor::value_type>::max)();
    size_t place_size = places.size();
    // NOTE: We compare kernel's place with place in valid_places to select the
......
...@@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
...@@ -25,6 +27,8 @@ set(apu_subgraph_bridges
    subgraph_bridge_softmax_op_apu
    subgraph_bridge_fc_op_apu
    subgraph_bridge_pool_op_apu
subgraph_bridge_conv_transpose_op_apu
subgraph_bridge_concat_op_apu
    CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto out_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<int>("axis");
auto num = x_names.size();
// Process data layout axis change
if (axis == 1)
axis = 3;
else if (axis == 2)
axis = 1;
else if (axis == 3)
axis = 2;
// Limitation:
// All input tensors of NEURON_TENSOR_QUANT8_ASYMM must
// have the same scale and zeroPoint as the output tensor
CHECK(op_info->HasOutputScale(out_name));
auto output_scale = op_info->GetOutputScale(out_name)[0];
// Traverse all of input nodes
std::vector<std::shared_ptr<Node>> input_nodes;
NeuronOperandType xType;
for (auto& x_name : x_names) {
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
std::shared_ptr<Node> x_node = nullptr;
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
// Add x tensor type
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = input_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << " is input and already exists";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << " is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
// Add input operand
if (graph->IsInput(x_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x_name because we add transpose op
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
if (x_node == nullptr) return subgraph::FAILED;
input_nodes.push_back(x_node);
VLOG(3) << "input node x: " << x_node->index()
<< ": input_scale: " << input_scale << " x_dims:" << x_dims[0]
<< ":" << x_dims[1] << ":" << x_dims
<< ", inType: " << xType.dimensions[0] << ":" << xType.dimensions[1]
<< ":" << xType.dimensions[2] << ":" << xType.dimensions[3];
} // End of for
if (input_nodes.size() != num) {
LOG(WARNING) << "Create input operand failed!";
return subgraph::FAILED;
}
// Add axis operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add axis operand
std::shared_ptr<Node> axis_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // axis
axis_node = graph->Add(out_name + "_axis", dims_int32);
VLOG(3) << "axis:" << axis;
// Add out operand type
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
// Add out operand
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
VLOG(3) << "out node idx: " << out_node->index()
<< ": output_scale: " << outType.scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set axis value
int32_t axis_val[1] = {(int32_t)axis};
NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex;
for (auto& node : input_nodes) {
addInIndex.push_back(node->index());
}
addInIndex.push_back(axis_node->index());
std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_CONCATENATION,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return subgraph::FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return subgraph::FAILED;
}
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kAPU,
paddle::lite::subgraph::apu::ConcatConverter);
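One detail of the converter worth calling out: because every tensor handed to the Neuron runtime has already been transposed to NHWC, the Paddle (NCHW) concat axis has to be remapped before it is written into the axis operand. The if/else chain near the top of ConcatConverter implements exactly this mapping; restated as a small helper for clarity (a sketch, not part of the patch):

int RemapConcatAxisToNHWC(int nchw_axis) {
  switch (nchw_axis) {
    case 1: return 3;   // C moves to the innermost position in NHWC
    case 2: return 1;   // H
    case 3: return 2;   // W
    default: return nchw_axis;  // the batch axis (0) is unchanged
  }
}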
...@@ -73,7 +73,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);
  bool is_depthwise_mode = ic == groups && oc == groups;
  VLOG(3) << "is_depthwise_mode: " << is_depthwise_mode;
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
...@@ -103,6 +103,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto filter_scale = op_info->GetInputScale(filter_name);
  CHECK(op_info->HasOutputScale(output_name));
  auto output_scale = op_info->GetOutputScale(output_name)[0];
auto orig_output_scale = op_info->GetOutputScale(output_name)[0];
  VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
          << " ,dilations: " << dilations[0] << ":" << dilations[1];
...@@ -128,23 +129,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    VLOG(3) << "Graph has " << input_name;
// input operand already exist
if (graph->IsInput(input_name)) {
VLOG(3) << input_name << "is input and already exist";
input_name = "transpose_" + input_name;
}
if (graph->IsOutput(input_name)) {
VLOG(3) << input_name << "is input and output node";
input_name = "transpose_" + input_name;
}
    input_node = graph->Get(input_name);
  } else {
    // add input operand
    if (graph->IsInput(input_name)) {
      // Insert transpose for NCHW -> NHWC
      insert_transpose_node(ctx,
                            input_name,
                            "transpose_" + input_name,
                            {(uint32_t)input_dims[0],
                             (uint32_t)input_dims[1],
                             (uint32_t)input_dims[2],
                             (uint32_t)input_dims[3]},
                            dims_in,
                            {0, 2, 3, 1},
                            inType.scale,
                            inType.zeroPoint);
      input_name = "transpose_" + input_name;
      input_node = graph->Get(input_name);
      if (input_node == nullptr) return subgraph::FAILED;
...@@ -153,7 +163,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      input_node = graph->Add(input_name, dims_in);
    }
  }
  VLOG(3) << "input node idx: " << input_node->index()
          << ": input_scale: " << input_scale
          << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
          << ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
...@@ -161,8 +171,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add bias type
  NeuronOperandType biasType;
  // Add filter type, filter data re-layout NCHW -> NHWC
  Tensor transpose_filter;
  std::vector<uint32_t> dims_filter;
...@@ -233,10 +242,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    biasType.scale = 0;
  }
auto precision = filter->precision();
  std::shared_ptr<Node> filter_node = nullptr;
  if (1 == filter_scale.size()) {
    NeuronModel_addOperand(model, &filterType);
    filter_node = graph->Add(filter_name, dims_filter);  // Operand 1: filter
    VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]"
            << filter_scale[0] << ": filterType: " << filterType.dimensions[0]
            << ":" << filterType.dimensions[1] << ":"
...@@ -251,7 +261,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      return subgraph::FAILED;
    }
  } else {
    NeuronModel_addOperand(model, &channelFilterType);  // Operand 1: filter
    filter_node = graph->Add(filter_name, dims_filter);
    VLOG(3) << "channel filter node idx: " << filter_node->index()
            << " ,scale_count:" << filter_scale.size()
...@@ -280,7 +290,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Add biasType node value
  // A 1-D tensor, of shape [depth_out], specifying the bias.
  // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias
  // should be of NEURON_TENSOR_INT32, with zeroPoint of 0
  // and bias_scale of 0. The actual scale of each value 'i' is equal
  // to bias_scale[i] = input_scale * filter_scale[i].
  biasType.type = NEURON_TENSOR_INT32;
...@@ -296,16 +306,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    for (int i = 0; i < bias_dims.size(); i++)
      dims_bias.push_back(bias_dims[i]);
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(bias_name, dims_bias);
    VLOG(3) << "node idx: " << bias_node->index()
            << ": Bias name: " << bias_name
            << " ,bias scale: " << biasType.scale
            << " ,dimensions: " << bias_dims;
  } else {
    biasType.dimensionCount = 1;
    dims_bias = {(uint32_t)output_dims[1]};
    biasType.dimensions = &dims_bias[0];
    NeuronModel_addOperand(model, &biasType);  // Operand 2: bias
    bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
    VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias "
            << " ,bias scale: " << biasType.scale
...@@ -318,39 +329,51 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  std::vector<uint32_t> dims_int32 = {1};
  std::shared_ptr<Node> paddingL_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 3: padding left
  paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);
  std::shared_ptr<Node> paddingR_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 4: padding right
  paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
  std::shared_ptr<Node> paddingT_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 5: padding top
  paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);
  std::shared_ptr<Node> paddingB_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 6: padding bottom
  paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);
  std::shared_ptr<Node> strideW_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 7: stride width
  strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);
  std::shared_ptr<Node> strideH_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 8: stride height
  strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);
  std::shared_ptr<Node> dm_node = nullptr;
  if (is_depthwise_mode) {
    NeuronModel_addOperand(model,
                           &int32Type);  // Operand 9: depthwise multiplier
    dm_node = graph->Add(filter_name + "_dm", dims_int32);
  }
  std::shared_ptr<Node> fuse_node = nullptr;
  NeuronModel_addOperand(model, &int32Type);  // Operand 9/10: fuse
  fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
/* Check output scale */
if (is_depthwise_mode) {
for (auto s : filter_scale) {
if (output_scale < s * input_scale)
output_scale = s * input_scale + 0.000001;
}
#ifdef LITE_MEDIATEK_APU_ENABLE_REQUANT
output_scale = orig_output_scale;
#endif
}
  // Add output tensor type
  NeuronOperandType outType;
  outType.type = NEURON_TENSOR_QUANT8_ASYMM;
...@@ -366,12 +389,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (graph->Has(output_name)) {
    output_node = graph->Get(output_name);
  } else {
    // Add output operand
    NeuronModel_addOperand(model, &outType);
    if (orig_output_scale != output_scale) {
      // Need to insert requant op, the result is requant_ -> transpose_ ->
      // output
      output_node = graph->Add("requant_" + output_name, dims_out);
    } else if (graph->IsOutput(output_name)) {
      // Need to insert transpose op, transpose_ -> output
      output_node = graph->Add("transpose_" + output_name, dims_out);
    } else {
      output_node = graph->Add(output_name, dims_out);
    }
  }
...@@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -433,10 +461,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add Stride // Add Stride
int32_t stride_val[1]; int32_t stride_val[1];
stride_val[0] = strides[1]; // width stride_val[0] = strides[1]; // Entry 1: width stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // height stride_val[0] = strides[0]; // Entry 0: height stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
...@@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -460,7 +488,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
model, dm_node->index(), &dm, sizeof(int32_t) * 1); model, dm_node->index(), &dm, sizeof(int32_t) * 1);
VLOG(3) << "depthwise multiplier:" << dm; VLOG(3) << "depthwise multiplier:" << dm;
// Depthwise conv // Depthwise conv case
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = { std::vector<uint32_t> addInIndex = {
...@@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -512,19 +540,46 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
return FAILED; return FAILED;
} }
// Check if Requant OP is needed
std::shared_ptr<Node> requant_node = nullptr;
if (orig_output_scale != output_scale) {
std::string requant_out_name = output_name;
VLOG(3) << "Insert requant output scale, orig:" << orig_output_scale
<< " ,output_scale:" << output_scale;
if (graph->IsOutput(output_name)) {
requant_out_name = "transpose_" + output_name;
}
insert_requant_node(ctx,
"requant_" + output_name,
requant_out_name,
dims_out,
dims_out,
output_scale,
orig_output_scale,
outType.zeroPoint);
requant_node = graph->Get(requant_out_name);
if (requant_node == nullptr) return subgraph::FAILED;
}
std::shared_ptr<Node> transpose_node = nullptr;
if (graph->IsOutput(output_name)) { if (graph->IsOutput(output_name)) {
VLOG(3) << "Add output transpose:" << output_name;
// Insert transpose for NHWC -> NCHW // Insert transpose for NHWC -> NCHW
insert_transpose_node( insert_transpose_node(ctx,
ctx, "transpose_" + output_name,
"transpose_" + output_name, output_name,
output_name, dims_out,
dims_out, {(uint32_t)output_dims[0],
{output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, (uint32_t)output_dims[1],
{0, 3, 1, 2}, (uint32_t)output_dims[2],
outType.scale, (uint32_t)output_dims[3]},
outType.zeroPoint); {0, 3, 1, 2},
output_node = graph->Get(output_name); outType.scale,
if (output_node == nullptr) return subgraph::FAILED; outType.zeroPoint);
transpose_node = graph->Get(output_name);
if (transpose_node == nullptr) return subgraph::FAILED;
} }
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph *>(ctx);
auto model = graph->model();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input, output and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_EQ(input_dims.size(), 4);
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
CHECK_EQ(filter_dims.size(), 4);
auto output_name = op_info->Output("Output").front();
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK_EQ(strides.size(), 2L);
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
if (groups > 1) {
LOG(WARNING) << "[NPU] only support groups == 1";
return FAILED;
}
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
auto fuse_relu =
op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
CHECK_EQ(dilations.size(), 2L);
std::string padding_algorithm =
op_info->HasAttr("padding_algorithm")
? op_info->GetAttr<std::string>("padding_algorithm")
: "";
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
  CHECK_EQ(paddings.size(), 4L)
      << "[APU] Paddings size should be 2 (same padding on both sides of each "
         "spatial dim) or 4, but got "
      << paddings.size();
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
std::vector<int> output_dims;
// Set output_dims: batches
output_dims.push_back(input_dims[0]);
std::vector<int> output_size;
if (op_info->HasAttr("output_size")) {
output_size = op_info->GetAttr<std::vector<int>>("output_size");
}
  if (output_size.size() >= 2) {
// Set output_dims: height, width
output_dims.push_back(output_size[0]);
output_dims.push_back(output_size[1]);
} else {
// Compute output size
    for (size_t i = 0; i < strides.size(); i++) {
      int kernel_ext = filter_dims[i + 2];
      // Use a local name that does not shadow the output_size vector above
      int out_hw = (input_dims[i + 2] - 1) * strides[i] + kernel_ext -
                   paddings[i * 2] - paddings[i * 2 + 1];
      output_dims.push_back(out_hw);
}
}
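  // Worked example of the formula above (hypothetical numbers): an input
  // height of 14 with stride 2, kernel 4 and paddings 1/1 gives a
  // transposed-conv output height of (14 - 1) * 2 + 4 - 1 - 1 = 28.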
output_dims.push_back(filter_dims[1]);
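  // Note: output_dims is therefore laid out as N, H, W, C (NHWC), which is
  // the layout the Neuron output operand and the default bias size below
  // rely on.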
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type: " << act_type;
VLOG(3) << "input_dims: " << input_dims
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims(Cin, Cout, H, W): " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
// Add input tensor type
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = input_scale;
inType.zeroPoint = 128;
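  // Paddle's symmetric int8 tensors are mapped onto Neuron's asymmetric uint8
  // operands with a fixed zeroPoint of 128 (see transposeAsym, which adds 128
  // per element).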
inType.dimensionCount = input_dims.size();
std::vector<uint32_t> dims_in = {(uint32_t)input_dims[0],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3],
(uint32_t)input_dims[1]};
inType.dimensions = &dims_in[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(3) << "Graph has " << input_name;
// Input operand already created by previous OP
input_node = graph->Get(input_name);
} else {
// Add input operand
if (graph->IsInput(input_name)) {
// Insert transpose for NCHW -> NHWC
insert_transpose_node(ctx,
input_name,
"transpose_" + input_name,
{(uint32_t)input_dims[0],
(uint32_t)input_dims[1],
(uint32_t)input_dims[2],
(uint32_t)input_dims[3]},
dims_in,
{0, 2, 3, 1},
inType.scale,
inType.zeroPoint);
// Change input_name because we add transpose op
input_name = "transpose_" + input_name;
input_node = graph->Get(input_name);
if (input_node == nullptr) return subgraph::FAILED;
} else {
NeuronModel_addOperand(model, &inType);
input_node = graph->Add(input_name, dims_in);
}
}
VLOG(3) << "input node idx: " << input_node->index()
<< ": input_scale: " << input_scale
<< ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1]
<< ":" << inType.dimensions[2] << ":" << inType.dimensions[3];
// Add bias type
NeuronOperandType biasType;
// Add filter type
// Relay out filter (Cin,Cout,H,W) -> (depth_out, h, w, depth_in)
Tensor transpose_filter;
std::vector<uint32_t> dims_filter;
transpose_filter.Resize({(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]});
transposeAsym(filter->data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
{(uint32_t)filter_dims[0],
(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3]},
{1, 2, 3, 0});
dims_filter = {(uint32_t)filter_dims[1],
(uint32_t)filter_dims[2],
(uint32_t)filter_dims[3],
(uint32_t)filter_dims[0]};
NeuronOperandType filterType;
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
biasType.scale = inType.scale * filterType.scale;
std::shared_ptr<Node> filter_node = nullptr;
NeuronModel_addOperand(model, &filterType);
filter_node = graph->Add(filter_name, dims_filter);
auto precision = filter->precision();
VLOG(3) << " filter node idx: " << filter_node->index()
<< " filter_scale[0]=" << filter_scale[0]
<< " filter memory_size=" << filter->memory_size()
<< " filter precision=" << PrecisionToStr(precision)
<< " :filterType: " << filterType.dimensions[0] << ":"
          << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":"
<< filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
transpose_filter.mutable_data<uint8_t>(),
filter->memory_size());
// Set filter value
neuron_errCode = NeuronModel_setOperandValue(
model, filter_node->index(), filter->raw_data(), filter->memory_size());
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
// Add biasType node value
// A 1-D tensor, of shape [depth_out], specifying the bias.
// For filter tensor of NEURON_TENSOR_QUANT8_ASYMM, the bias should be of
// NEURON_TENSOR_INT32 with zeroPoint of 0 and bias_scale ==
// input_scale * filter_scale
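  // For example (hypothetical values): with input_scale = 0.05 and
  // filter_scale[0] = 0.02, the bias operand is quantized with
  // bias_scale = 0.05 * 0.02 = 0.001 and zeroPoint = 0.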
biasType.type = NEURON_TENSOR_INT32;
biasType.zeroPoint = 0;
std::vector<uint32_t> dims_bias;
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto channel_size = bias->dims().production();
CHECK_EQ(channel_size, filter_dims[1] * groups);
CHECK_EQ(bias_dims.size(), 1);
biasType.dimensionCount = bias_dims.size();
for (int i = 0; i < bias_dims.size(); i++)
dims_bias.push_back(bias_dims[i]);
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: " << bias_name
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << bias_dims
<< " ,channel_size:" << channel_size;
} else {
// Create default bias with value 0
biasType.dimensionCount = 1;
    // output_dims is NHWC here, so index 3 holds the output channel count
    // (matching the int32_bias shape {1, output_dims[3]} set below)
    dims_bias = {(uint32_t)output_dims[3]};
biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(filter_name + "_default_bias", dims_bias);
VLOG(3) << "node idx: " << bias_node->index()
<< ": Bias name: default_bias "
<< " ,bias scale: " << biasType.scale
<< " ,dimensions: " << dims_bias.size();
}
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 3: padding left
paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 4: padding right
paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 5: padding top
paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 6: padding bottom
paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 7: stride width
strideW_node = graph->Add(filter_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 8: stride height
strideH_node = graph->Add(filter_name + "_stride_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse
fuse_node = graph->Add(filter_name + "_fuse", dims_int32);
NeuronOperandType boolType;
boolType.type = NEURON_BOOL;
boolType.dimensionCount = 0; // Must be 0 for scalars.
std::shared_ptr<Node> layout_node = nullptr;
  NeuronModel_addOperand(model, &boolType);  // Operand 10: NCHW layout flag
layout_node = graph->Add(filter_name + "_layout", dims_int32);
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
(uint32_t)output_dims[1],
(uint32_t)output_dims[2],
(uint32_t)output_dims[3]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> output_node = nullptr;
if (graph->Has(output_name)) {
output_node = graph->Get(output_name);
} else {
if (graph->IsOutput(output_name)) {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add("transpose_" + output_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
output_node = graph->Add(output_name, dims_out);
}
}
VLOG(3) << "output node idx: " << output_node->index()
<< ": output_scale: " << outType.scale
<< " ,outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Add bias value
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
int32_t *int32_bias_data =
reinterpret_cast<int32_t *>(bias->mutable_data<float>());
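    // float2int32 presumably quantizes each float bias value in place to
    // int32 with per-channel scale input_scale * filter_scale[c], reusing the
    // float buffer as int32 storage.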
float2int32(
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << ":"
<< int32_bias_data[1] << ":" << int32_bias_data[2] << ":"
<< int32_bias_data[3];
neuron_errCode = NeuronModel_setOperandValue(
model, bias_node->index(), bias->raw_data(), bias->memory_size());
} else {
auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, output_dims[3]});
int32_bias->mutable_data<int32_t>();
VLOG(3) << "bais_default: " << int32_bias->memory_size();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
neuron_errCode = NeuronModel_setOperandValue(model,
bias_node->index(),
int32_bias->raw_data(),
int32_bias->memory_size());
bias_node->set_data(int32_bias);
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode;
return subgraph::FAILED;
}
VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":"
<< paddings[2] << ":" << paddings[3];
// Add padding value
int32_t padding_val[1];
padding_val[0] = paddings[2];
NeuronModel_setOperandValue(
model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[3];
NeuronModel_setOperandValue(
model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[0];
NeuronModel_setOperandValue(
model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1);
padding_val[0] = paddings[1];
NeuronModel_setOperandValue(
model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1);
VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0];
// Add Stride
int32_t stride_val[1];
stride_val[0] = strides[1]; // entry 1: width stride
NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // entry 0: height stride
NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
bool layout_val[] = {false};
NeuronModel_setOperandValue(
model, layout_node->index(), layout_val, sizeof(bool) * 1);
std::vector<uint32_t> addInIndex = {
input_node->index(), // 0: input
filter_node->index(), // 1: filter
bias_node->index(), // 2: bias
paddingL_node->index(), // 3: padding left
paddingR_node->index(), // 4: padding right
paddingT_node->index(), // 5: padding top
paddingB_node->index(), // 6: padding bottom
strideW_node->index(), // 7: stride width
strideH_node->index(), // 8: stride height
fuse_node->index(), // 9: fuse
layout_node->index()}; // 10: layout
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = NeuronModel_addOperation(model,
NEURON_TRANSPOSE_CONV_2D,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Add op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(output_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + output_name,
output_name,
dims_out,
                          // output_dims is NHWC, so reorder to N, C, H, W
                          // for the transposed (NCHW) output shape
                          {(uint32_t)output_dims[0],
                           (uint32_t)output_dims[3],
                           (uint32_t)output_dims[1],
                           (uint32_t)output_dims[2]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
output_node = graph->Get(output_name);
if (output_node == nullptr) return subgraph::FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose,
kAPU,
paddle::lite::subgraph::apu::ConvTransposeConverter);
...@@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -29,28 +29,252 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info(); auto op_info = op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto scope = op->scope(); auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "..."; int neuron_errCode;
VLOG(3) << "[APU] Converting [" + op_type + "]";
// Get input and output vars and op attributes // Get input and output vars and op attributes
auto x_name = op_info->Input("X").front(); auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name); auto x = scope->FindTensor(x_name);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front(); auto y_name = op_info->Input("Y").front();
auto y = scope->FindMutableTensor(y_name); auto y = scope->FindTensor(y_name);
auto y_dims = y->dims(); auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front(); auto out_name = op_info->Output("Out").front();
auto out = scope->FindMutableTensor(out_name); auto out = scope->FindTensor(out_name);
auto out_dims = out->dims(); auto out_dims = out->dims();
auto axis = op_info->GetAttr<int>("axis"); auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
auto x_shape = x_dims.Vectorize();
auto y_shape = y_dims.Vectorize();
  // Two dimensions are compatible when:
  // 1. they are equal, or
  // 2. the y dimension is 1 (y is broadcast into x)
  if (axis < 0 ||
      static_cast<size_t>(axis) + y_shape.size() > x_shape.size()) {
    LOG(WARNING) << "[APU] Invalid broadcast axis: " << axis;
    return FAILED;
  }
  for (size_t i = 0; i < y_shape.size(); i++) {
    if (x_shape[axis + i] != y_shape[i] && y_shape[i] != 1) {
      LOG(WARNING) << "[APU] Incompatible dims for broadcast at x dim "
                   << axis + i << ": x = " << x_shape[axis + i]
                   << ", y = " << y_shape[i];
      return FAILED;
    }
  }  // End of for
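  // For example, x of shape (1, 64, 32, 32) and y of shape (64) with axis = 1
  // are compatible: y is broadcast over the batch and spatial dimensions.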
int32_t fuse_val[1] = {NEURON_FUSED_NONE};
// Act node // Act node
if (op_type == "fusion_elementwise_add_activation" || if (op_type == "fusion_elementwise_add_activation" ||
op_type == "fusion_elementwise_sub_activation" || op_type == "fusion_elementwise_sub_activation" ||
op_type == "fusion_elementwise_mul_activation" || op_type == "fusion_elementwise_mul_activation" ||
op_type == "fusion_elementwise_div_activation") { op_type == "fusion_elementwise_div_activation") {
auto act_type = op_info->GetAttr<std::string>("act_type"); auto act_type = op_info->GetAttr<std::string>("act_type");
if (act_type == "relu") {
fuse_val[0] = NEURON_FUSED_RELU;
} else if (act_type == "relu1") {
fuse_val[0] = NEURON_FUSED_RELU1;
} else if (act_type == "relu6") {
fuse_val[0] = NEURON_FUSED_RELU6;
} else if (!act_type.empty()) {
fuse_val[0] = NEURON_FUSED_NONE;
LOG(WARNING) << "Support act_type: " << act_type;
return FAILED;
}
} // End of if
VLOG(3) << "x_name" << x_name;
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasInputScale(y_name));
auto y_scale = op_info->GetInputScale(y_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add x tensor type
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
xType.scale = x_scale;
xType.zeroPoint = 128;
xType.dimensionCount = x_dims.size();
std::vector<uint32_t> dims_x = {(uint32_t)x_dims[0],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3],
(uint32_t)x_dims[1]};
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
VLOG(3) << "Graph has " << x_name;
if (graph->IsInput(x_name)) {
VLOG(3) << x_name << "is input and already exist";
x_name = "transpose_" + x_name;
}
if (graph->IsOutput(x_name)) {
VLOG(3) << x_name << "is input and output node";
x_name = "transpose_" + x_name;
}
x_node = graph->Get(x_name);
} else {
if (graph->IsInput(x_name)) {
insert_transpose_node(ctx,
x_name,
"transpose_" + x_name,
{(uint32_t)x_dims[0],
(uint32_t)x_dims[1],
(uint32_t)x_dims[2],
(uint32_t)x_dims[3]},
dims_x,
{0, 2, 3, 1},
xType.scale,
xType.zeroPoint);
// Change x name after insert transpose op for x data relayout
x_name = "transpose_" + x_name;
x_node = graph->Get(x_name);
} else {
NeuronModel_addOperand(model, &xType);
x_node = graph->Add(x_name, dims_x);
}
} // End of else
VLOG(3) << "x node idx: " << x_node->index() << "x_dims: " << x_dims
<< ": x_scale: " << x_scale << ", xType: " << xType.dimensions[0]
<< ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3];
// Add y tensor type
NeuronOperandType yType;
yType.type = NEURON_TENSOR_QUANT8_ASYMM;
yType.scale = y_scale;
yType.zeroPoint = 128;
yType.dimensionCount = y_dims.size();
std::vector<uint32_t> dims_y = {(uint32_t)y_dims[0],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3],
(uint32_t)y_dims[1]};
yType.dimensions = &dims_y[0];
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
VLOG(3) << "Graph has " << y_name;
y_node = graph->Get(y_name);
} else {
if (graph->IsInput(y_name)) {
insert_transpose_node(ctx,
y_name,
"transpose_" + y_name,
{(uint32_t)y_dims[0],
(uint32_t)y_dims[1],
(uint32_t)y_dims[2],
(uint32_t)y_dims[3]},
dims_y,
{0, 2, 3, 1},
yType.scale,
yType.zeroPoint);
y_name = "transpose_" + y_name;
y_node = graph->Get(y_name);
} else {
NeuronModel_addOperand(model, &yType);
y_node = graph->Add(y_name, dims_y);
}
}
VLOG(3) << "y node idx: " << y_node->index() << "y_dims: " << y_dims
<< ": y_scale: " << y_scale << ", yType: " << yType.dimensions[0]
<< ":" << yType.dimensions[1] << ":" << yType.dimensions[2] << ":"
<< yType.dimensions[3];
// Add fuse operand type
NeuronOperandType int32Type;
int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {1};
// Add fuse operand
std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // Operand 2: fuse
fuse_node = graph->Add(out_name + "_fuse", dims_int32);
// Add out tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = out_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0];
std::shared_ptr<Node> out_node = nullptr;
if (graph->Has(out_name)) {
VLOG(3) << "Graph has " << out_name;
out_node = graph->Get(out_name);
} else {
if (graph->IsOutput(out_name)) {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add("transpose_" + out_name, dims_out);
} else {
NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out);
}
}
VLOG(3) << "out node idx: " << out_node->index() << "out_dims: " << out_dims
<< ": out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3];
// Set fuse value
NeuronModel_setOperandValue(
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {
x_node->index(), // 0: A tensor
y_node->index(), // 1: A tensor of the same OperandCode,
// and compatible dimensions as input 0
fuse_node->index()}; // 2: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()};
if (op_type == "elementwise_add" ||
op_type == "fusion_elementwise_add_activation") {
neuron_errCode = NeuronModel_addOperation(model,
NEURON_ADD,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
} else {
LOG(WARNING) << "[APU] Unsupported op type: " << op_type;
return FAILED;
}
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "ADD op fail:" << op_type;
return FAILED;
}
if (graph->IsOutput(out_name)) {
// Insert transpose for NHWC -> NCHW
insert_transpose_node(ctx,
"transpose_" + out_name,
out_name,
dims_out,
{(uint32_t)out_dims[0],
(uint32_t)out_dims[1],
(uint32_t)out_dims[2],
(uint32_t)out_dims[3]},
{0, 3, 1, 2},
outType.scale,
outType.zeroPoint);
out_node = graph->Get(out_name);
if (out_node == nullptr) return FAILED;
} }
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
...@@ -67,3 +291,6 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_add, ...@@ -67,3 +291,6 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kAPU, kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter); paddle::lite::subgraph::apu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kAPU,
paddle::lite::subgraph::apu::ElementwiseConverter);
...@@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -77,12 +77,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
inType.dimensions = &dims_in[0]; inType.dimensions = &dims_in[0];
std::shared_ptr<Node> in_node = nullptr; std::shared_ptr<Node> in_node = nullptr;
if (graph->Has(input_name)) { if (graph->Has(input_name)) {
// input operand already exist
in_node = graph->Get(input_name); in_node = graph->Get(input_name);
VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index();
} else { } else {
// add input operand NeuronModel_addOperand(model, &inType); // Operand 0: input
NeuronModel_addOperand(model, &inType); // 0: input
in_node = graph->Add(input_name, dims_in); in_node = graph->Add(input_name, dims_in);
} }
VLOG(3) << "input_scale: " << input_scale VLOG(3) << "input_scale: " << input_scale
...@@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -97,7 +95,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
wType.dimensionCount = w_dims.size(); wType.dimensionCount = w_dims.size();
std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; std::vector<uint32_t> dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]};
wType.dimensions = &dims_w[0]; wType.dimensions = &dims_w[0];
NeuronModel_addOperand(model, &wType); // 1: weight NeuronModel_addOperand(model, &wType); // Operand 1: weight
std::shared_ptr<Node> w_node = nullptr; std::shared_ptr<Node> w_node = nullptr;
w_node = graph->Add(w_name, dims_w); w_node = graph->Add(w_name, dims_w);
VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0] VLOG(3) << "w_scale size: " << w_scale.size() << ",w_scale: " << w_scale[0]
...@@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -119,7 +117,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = bias_dims.size(); biasType.dimensionCount = bias_dims.size();
std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]}; std::vector<uint32_t> dims_bias = {(uint32_t)bias_dims[0]};
biasType.dimensions = &dims_bias[0]; biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // 2: bias NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(bias_name, dims_bias); bias_node = graph->Add(bias_name, dims_bias);
VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims
<< ", bias scale: " << biasType.scale << ", bias scale: " << biasType.scale
...@@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -128,7 +126,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
biasType.dimensionCount = 1; biasType.dimensionCount = 1;
std::vector<uint32_t> dims_bias = {(uint32_t)n}; std::vector<uint32_t> dims_bias = {(uint32_t)n};
biasType.dimensions = &dims_bias[0]; biasType.dimensions = &dims_bias[0];
NeuronModel_addOperand(model, &biasType); // 2: bias NeuronModel_addOperand(model, &biasType); // Operand 2: bias
bias_node = graph->Add(w_name + "_default_bias", dims_bias); bias_node = graph->Add(w_name + "_default_bias", dims_bias);
} }
...@@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -137,7 +135,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
fuseType.type = NEURON_INT32; fuseType.type = NEURON_INT32;
fuseType.dimensionCount = 0; fuseType.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0}; std::vector<uint32_t> dims_int32 = {0};
NeuronModel_addOperand(model, &fuseType); // 3: fuse NeuronModel_addOperand(model, &fuseType); // Operand 3: fuse
std::shared_ptr<Node> fuse_node = nullptr; std::shared_ptr<Node> fuse_node = nullptr;
fuse_node = graph->Add(w_name + "_fuse", dims_int32); fuse_node = graph->Add(w_name + "_fuse", dims_int32);
...@@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -147,12 +145,13 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.scale = out_scale; outType.scale = out_scale;
outType.zeroPoint = 128; outType.zeroPoint = 128;
outType.dimensionCount = 2; outType.dimensionCount = 2;
std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0], out_dims[1]}; std::vector<uint32_t> dims_out = {(uint32_t)out_dims[0],
(uint32_t)out_dims[1]};
outType.dimensions = &dims_out[0]; outType.dimensions = &dims_out[0];
VLOG(3) << "out_scale: " << out_scale VLOG(3) << "out_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << " : " << ", outType: " << outType.dimensions[0] << " : "
<< outType.dimensions[1]; << outType.dimensions[1];
NeuronModel_addOperand(model, &outType); // output NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> out_node = nullptr; std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_out); out_node = graph->Add(out_name, dims_out);
...@@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -190,29 +189,31 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_setOperandValue(model, NeuronModel_setOperandValue(model,
bias_node->index(), bias_node->index(),
bias->raw_data(), bias->raw_data(),
bias->memory_size()); // 2: bias bias->memory_size()); // Operand 2: bias
} else { } else {
auto int32_bias = std::make_shared<Tensor>(); auto int32_bias = std::make_shared<Tensor>();
int32_bias->Resize({1, out_dims[1]}); int32_bias->Resize({1, out_dims[1]});
int32_bias->mutable_data<int32_t>(); int32_bias->mutable_data<int32_t>();
memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size()); memset(int32_bias->mutable_data<int32_t>(), 0, int32_bias->memory_size());
VLOG(3) << "default: " << int32_bias->memory_size(); VLOG(3) << "default: " << int32_bias->memory_size();
neuron_errCode = neuron_errCode = NeuronModel_setOperandValue(
NeuronModel_setOperandValue(model, model,
bias_node->index(), bias_node->index(),
int32_bias->raw_data(), int32_bias->raw_data(),
int32_bias->memory_size()); // 2: bias int32_bias->memory_size()); // Operand 2: bias
bias_node->set_data(int32_bias); bias_node->set_data(int32_bias);
} }
// Add fuse value // Add fuse value
int32_t fuse_val[1] = {0}; int32_t fuse_val[1] = {0};
NeuronModel_setOperandValue( NeuronModel_setOperandValue(model,
model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); // 3: fuse fuse_node->index(),
fuse_val,
std::vector<uint32_t> addInIndex = {in_node->index(), sizeof(int32_t) * 1); // Operand 3: fuse
w_node->index(),
bias_node->index(), std::vector<uint32_t> addInIndex = {in_node->index(), // 0: input
fuse_node->index()}; w_node->index(), // 1: weight
bias_node->index(), // 2: bias
fuse_node->index()}; // 3: fuse
std::vector<uint32_t> addOutIndex = {out_node->index()}; std::vector<uint32_t> addOutIndex = {out_node->index()};
neuron_errCode = NeuronModel_addOperation(model, neuron_errCode = NeuronModel_addOperation(model,
NEURON_FULLY_CONNECTED, NEURON_FULLY_CONNECTED,
......
...@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) { ...@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
LOG(FATAL) << "[APU] Node" << name << " is redefined."; LOG(FATAL) << "[APU] Node" << name << " is redefined.";
return -1; return -1;
} else { } else {
VLOG(3) << " Add: " << name << " : " << node->index(); VLOG(5) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert( auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>())); std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second); CHECK(ret.second);
......
...@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); ...@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU); USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU); USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU); USE_SUBGRAPH_BRIDGE(softmax, kAPU);
USE_SUBGRAPH_BRIDGE(concat, kAPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
...@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -47,14 +47,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto ksize = op_info->GetAttr<std::vector<int>>("ksize"); auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings"); std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
// pool mode // Check pool mode
if ((pooling_type == "max") || (pooling_type == "avg")) { if ((pooling_type == "max") || (pooling_type == "avg")) {
} else { } else {
LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type; LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type;
return FAILED; return FAILED;
} }
// pad mode // Check padding mode
int pad_mode = 0; int pad_mode = 0;
std::string padding_algorithm(""); std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) { if (op_info->HasAttr("padding_algorithm")) {
...@@ -66,7 +66,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -66,7 +66,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
pad_mode = 5; pad_mode = 5;
} }
// paddings and strides // Check paddings and strides
if (paddings.size() == 2L) { if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) { for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i); int copy_pad = *(paddings.begin() + 2 * i);
...@@ -107,60 +107,59 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -107,60 +107,59 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0]; xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr; std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) { if (graph->Has(x_name)) {
LOG(INFO) << "Graph has " << x_name; VLOG(3) << "Graph has " << x_name;
// input operand already exist
x_node = graph->Get(x_name); x_node = graph->Get(x_name);
} else { } else {
// add input operand NeuronModel_addOperand(model, &xType); // Operand 0: x
NeuronModel_addOperand(model, &xType); // 0: x
x_node = graph->Add(x_name, dims_x); x_node = graph->Add(x_name, dims_x);
} }
VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":" VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":"
<< xType.dimensions[1] << ":" << xType.dimensions[2] << ":" << xType.dimensions[1] << ":" << xType.dimensions[2] << ":"
<< xType.dimensions[3]; << xType.dimensions[3];
VLOG(3) << "ksize:" << ksize[0] << ":" << ksize[1];
NeuronOperandType int32Type; NeuronOperandType int32Type;
int32Type.type = NEURON_INT32; int32Type.type = NEURON_INT32;
int32Type.dimensionCount = 0; int32Type.dimensionCount = 0;
std::vector<uint32_t> dims_int32 = {0}; std::vector<uint32_t> dims_int32 = {0};
std::shared_ptr<Node> paddingL_node = nullptr; std::shared_ptr<Node> paddingL_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 1: padding left NeuronModel_addOperand(model, &int32Type); // Operand 1: padding left
paddingL_node = graph->Add(x_name + "_padding_left", dims_int32); paddingL_node = graph->Add(x_name + "_padding_left", dims_int32);
std::shared_ptr<Node> paddingR_node = nullptr; std::shared_ptr<Node> paddingR_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 2: padding right NeuronModel_addOperand(model, &int32Type); // Operand 2: padding right
paddingR_node = graph->Add(x_name + "_padding_right", dims_int32); paddingR_node = graph->Add(x_name + "_padding_right", dims_int32);
std::shared_ptr<Node> paddingT_node = nullptr; std::shared_ptr<Node> paddingT_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 3: padding top NeuronModel_addOperand(model, &int32Type); // Operand 3: padding top
paddingT_node = graph->Add(x_name + "_padding_top", dims_int32); paddingT_node = graph->Add(x_name + "_padding_top", dims_int32);
std::shared_ptr<Node> paddingB_node = nullptr; std::shared_ptr<Node> paddingB_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 4: padding bottom NeuronModel_addOperand(model, &int32Type); // Operand 4: padding bottom
paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32); paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32);
std::shared_ptr<Node> strideW_node = nullptr; std::shared_ptr<Node> strideW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 5: stride width NeuronModel_addOperand(model, &int32Type); // Operand 5: stride width
strideW_node = graph->Add(x_name + "_stride_width", dims_int32); strideW_node = graph->Add(x_name + "_stride_width", dims_int32);
std::shared_ptr<Node> strideH_node = nullptr; std::shared_ptr<Node> strideH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 6: stride height NeuronModel_addOperand(model, &int32Type); // Operand 6: stride height
strideH_node = graph->Add(x_name + "_stride_height", dims_int32); strideH_node = graph->Add(x_name + "_stride_height", dims_int32);
std::shared_ptr<Node> filterW_node = nullptr; std::shared_ptr<Node> filterW_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 7: filter width NeuronModel_addOperand(model, &int32Type); // Operand 7: filter width
filterW_node = graph->Add(x_name + "_filter_width", dims_int32); filterW_node = graph->Add(x_name + "_filter_width", dims_int32);
std::shared_ptr<Node> filterH_node = nullptr; std::shared_ptr<Node> filterH_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 8: filter height NeuronModel_addOperand(model, &int32Type); // Operand 8: filter height
filterH_node = graph->Add(x_name + "_filter_height", dims_int32); filterH_node = graph->Add(x_name + "_filter_height", dims_int32);
std::shared_ptr<Node> fuse_node = nullptr; std::shared_ptr<Node> fuse_node = nullptr;
NeuronModel_addOperand(model, &int32Type); // 9: fuse NeuronModel_addOperand(model, &int32Type); // Operand 9: fuse
fuse_node = graph->Add(x_name + "_fuse", dims_int32); fuse_node = graph->Add(x_name + "_pool_fuse", dims_int32);
// Add out type
// Add output tensor type // Add output tensor type
NeuronOperandType outType; NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM; outType.type = NEURON_TENSOR_QUANT8_ASYMM;
...@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -176,10 +175,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (graph->Has(out_name)) { if (graph->Has(out_name)) {
out_node = graph->Get(out_name); out_node = graph->Get(out_name);
} else { } else {
NeuronModel_addOperand(model, &outType); // out NeuronModel_addOperand(model, &outType);
out_node = graph->Add(out_name, dims_out); out_node = graph->Add(out_name, dims_out);
} }
VLOG(3) << "output_scale: " << x_scale VLOG(3) << "output_scale: " << out_scale
<< ", outType: " << outType.dimensions[0] << ":" << ", outType: " << outType.dimensions[0] << ":"
<< outType.dimensions[1] << ":" << outType.dimensions[2] << ":" << outType.dimensions[1] << ":" << outType.dimensions[2] << ":"
<< outType.dimensions[3]; << outType.dimensions[3];
...@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -201,19 +200,21 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add Stride // Add Stride
int32_t stride_val[1]; int32_t stride_val[1];
stride_val[0] = strides[1]; // width stride_val[0] = strides[1]; // Entry 1: width stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); model, strideW_node->index(), stride_val, sizeof(int32_t) * 1);
stride_val[0] = strides[0]; // height stride_val[0] = strides[0]; // Entry 0: height stride
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); model, strideH_node->index(), stride_val, sizeof(int32_t) * 1);
// Add filter // Add filter
int32_t filter_val[1]; int32_t filter_val[1];
filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width filter_val[0] =
global_pooling ? x_dims[3] : ksize[1]; // Entry 1: filter width
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, filterW_node->index(), filter_val, sizeof(int32_t) * 1); model, filterW_node->index(), filter_val, sizeof(int32_t) * 1);
filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height filter_val[0] =
global_pooling ? x_dims[2] : ksize[0]; // Entry 0: filter height
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, filterH_node->index(), filter_val, sizeof(int32_t) * 1); model, filterH_node->index(), filter_val, sizeof(int32_t) * 1);
......
...@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0]; xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr; std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) { if (graph->Has(x_name)) {
// input operand already exist
x_node = graph->Get(x_name); x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index(); VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else { } else {
// add input operand NeuronModel_addOperand(model, &xType); // Operand 0: input
NeuronModel_addOperand(model, &xType); // 0: input
x_node = graph->Add(x_name, dims_x); x_node = graph->Add(x_name, dims_x);
} }
VLOG(3) << "input_scale size: " << input_scale VLOG(3) << "input_scale size: " << input_scale
...@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType betaType; NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32; betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0; betaType.dimensionCount = 0;
NeuronModel_addOperand(model, &betaType); // 1: beta NeuronModel_addOperand(model, &betaType); // Operand 1: beta
std::shared_ptr<Node> beta_node = nullptr; std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32); beta_node = graph->Add(x_name + "_beta", dims_int32);
...@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType axisType; NeuronOperandType axisType;
axisType.type = NEURON_INT32; axisType.type = NEURON_INT32;
axisType.dimensionCount = 0; axisType.dimensionCount = 0;
NeuronModel_addOperand(model, &axisType); // 2: axis NeuronModel_addOperand(model, &axisType); // Operand 2: axis
std::shared_ptr<Node> axis_node = nullptr; std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32); axis_node = graph->Add(x_name + "_axis", dims_int32);
...@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.zeroPoint = 128; outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size(); outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0]; outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output NeuronModel_addOperand(model, &outType); // Operand 3: output
std::shared_ptr<Node> out_node = nullptr; std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x); out_node = graph->Add(out_name, dims_x);
VLOG(3) << "out_scale: " << out_scale; VLOG(3) << "out_scale: " << out_scale;
...@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis_val[0] = axis; axis_val[0] = axis;
NeuronModel_setOperandValue( NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1); model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = { std::vector<uint32_t> addInIndex = {x_node->index(), // 0: input
x_node->index(), beta_node->index(), axis_node->index()}; beta_node->index(), // 1: beta
axis_node->index()}; // 2: axis
std::vector<uint32_t> addOutIndex = {out_node->index()}; std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = NeuronModel_addOperation(model, int neuron_errCode = NeuronModel_addOperation(model,
NEURON_SOFTMAX, NEURON_SOFTMAX,
......
...@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info, ...@@ -39,22 +39,43 @@ bool HasInputArg(const OpInfo* op_info,
} }
} }
void insert_transpose_node(void* ctx, int insert_requant_node(void* ctx,
const std::string& input_name, const std::string& input_name,
const std::string& output_name, const std::string& output_name,
std::vector<uint32_t> input_shape, std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape, std::vector<uint32_t> output_shape,
std::vector<int32_t> axis, float scale_in,
float scale, float scale_out,
int32_t zeroPoint) { int32_t zeroPoint) {
int neuron_errCode; int neuron_errCode;
auto graph = static_cast<Graph*>(ctx); auto graph = static_cast<Graph*>(ctx);
auto model = graph->model(); auto model = graph->model();
uint32_t numDevices = 0;
CHECK_EQ(Neuron_getDeviceCount(&numDevices), NEURON_NO_ERROR);
CHECK_GT(numDevices, (uint32_t)0);
NeuronDevice* targetDevice = nullptr;
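  // Look up the "mtk-dsp" device: the MTK_REQUANTIZE vendor extension used
  // below is assumed to be available only on that device.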
for (uint32_t i = 0; i < numDevices; ++i) {
NeuronDevice* device = nullptr;
Neuron_getDevice(i, &device);
const char* name;
NeuronDevice_getName(device, &name);
if (0 == strcmp(name, "mtk-dsp")) {
targetDevice = device;
break;
}
}
if (targetDevice == nullptr) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
// Add input // Add input
NeuronOperandType inType; NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM; inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale; inType.scale = scale_in;
inType.zeroPoint = zeroPoint; inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size(); inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0]; inType.dimensions = &input_shape[0];
...@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx, ...@@ -64,15 +85,81 @@ void insert_transpose_node(void* ctx,
VLOG(3) << "Has " << input_name; VLOG(3) << "Has " << input_name;
input_node = graph->Get(input_name); input_node = graph->Get(input_name);
} else { } else {
neuron_errCode = NeuronModel_addOperand(model, &inType); // input neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert mtk_requant op fail!";
return; return -1;
} }
VLOG(3) << "Add " << input_name; VLOG(3) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape); input_node = graph->Add(input_name, input_shape);
} }
// Add output
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = scale_out;
outType.zeroPoint = zeroPoint;
outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape);
std::vector<uint32_t> addInIndex = {input_node->index()};
std::vector<uint32_t> addOutIndex = {output_node->index()};
neuron_errCode = NeuronModel_addOperationExtension(model,
"MTK_REQUANTIZE",
"mediatek",
targetDevice,
addInIndex.size(),
&addInIndex[0],
addOutIndex.size(),
&addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert mtk_requant op fail!";
return -1;
}
return 0;
}
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint) {
int neuron_errCode;
auto graph = static_cast<Graph*>(ctx);
auto model = graph->model();
// Add input
NeuronOperandType inType;
inType.type = NEURON_TENSOR_QUANT8_ASYMM;
inType.scale = scale;
inType.zeroPoint = zeroPoint;
inType.dimensionCount = input_shape.size();
inType.dimensions = &input_shape[0];
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
VLOG(5) << "Has " << input_name;
input_node = graph->Get(input_name);
} else {
neuron_errCode = NeuronModel_addOperand(model, &inType);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(FATAL) << "Insert transpose op fail!";
return -1;
}
VLOG(5) << "Add " << input_name;
input_node = graph->Add(input_name, input_shape);
}
// Add perm // Add perm
NeuronOperandType permsType; NeuronOperandType permsType;
permsType.type = NEURON_TENSOR_INT32; permsType.type = NEURON_TENSOR_INT32;
...@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx, ...@@ -80,22 +167,22 @@ void insert_transpose_node(void* ctx,
uint32_t dims_perms[1] = {4}; uint32_t dims_perms[1] = {4};
permsType.dimensions = dims_perms; permsType.dimensions = dims_perms;
neuron_errCode = NeuronModel_addOperand(model, &permsType); // perm neuron_errCode = NeuronModel_addOperand(model, &permsType);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
return; return -1;
} }
std::shared_ptr<Node> perms_node = nullptr; std::shared_ptr<Node> perms_node = nullptr;
perms_node = graph->Add(input_name + "_perms", {4}); perms_node = graph->Add(input_name + "_perms", {4});
VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" VLOG(5) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":"
<< axis[3]; << axis[3];
// &axis[0], sizeof(int32_t) * axis.size());
neuron_errCode = NeuronModel_setOperandValue( neuron_errCode = NeuronModel_setOperandValue(
model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size()); model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size());
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
return; return -1;
} }
// Add output // Add output
...@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx, ...@@ -106,7 +193,7 @@ void insert_transpose_node(void* ctx,
outType.dimensionCount = output_shape.size(); outType.dimensionCount = output_shape.size();
outType.dimensions = &output_shape[0]; outType.dimensions = &output_shape[0];
NeuronModel_addOperand(model, &outType); // output NeuronModel_addOperand(model, &outType);
std::shared_ptr<Node> output_node = nullptr; std::shared_ptr<Node> output_node = nullptr;
output_node = graph->Add(output_name, output_shape); output_node = graph->Add(output_name, output_shape);
...@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx, ...@@ -123,8 +210,10 @@ void insert_transpose_node(void* ctx,
&addOutIndex[0]); &addOutIndex[0]);
if (NEURON_NO_ERROR != neuron_errCode) { if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Insert transpose op fail!"; LOG(FATAL) << "Insert transpose op fail!";
} }
return 0;
} }
void transpose(const int8_t* input_data, void transpose(const int8_t* input_data,
...@@ -135,9 +224,9 @@ void transpose(const int8_t* input_data, ...@@ -135,9 +224,9 @@ void transpose(const int8_t* input_data,
int new_index = -1; int new_index = -1;
int dim[4] = {0}; int dim[4] = {0};
std::vector<uint32_t> shape = input_shape; std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3]; << ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
...@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data, ...@@ -164,9 +253,9 @@ void transposeAsym(const int8_t* input_data,
int new_index = -1; int new_index = -1;
int dim[4] = {0}; int dim[4] = {0};
std::vector<uint32_t> shape = input_shape; std::vector<uint32_t> shape = input_shape;
VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] VLOG(5) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2]
<< ":" << input_shape[3]; << ":" << input_shape[3];
VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; VLOG(5) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3];
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
...@@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data, ...@@ -177,8 +266,8 @@ void transposeAsym(const int8_t* input_data,
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
// Per-layer ops are asymmetric, so 128 needs to be added (int8 -> uint8)
output_data[new_index] = input_data[old_index] + 128; // per layer output_data[new_index] = input_data[old_index] + 128;
} }
} }
} }
......
...@@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info, ...@@ -33,14 +33,23 @@ bool HasInputArg(const OpInfo* op_info,
const Scope* scope, const Scope* scope,
const std::string& argname); const std::string& argname);
void insert_transpose_node(void* ctx, int insert_requant_node(void* ctx,
const std::string& input_name, const std::string& input_name,
const std::string& output_name, const std::string& output_name,
std::vector<uint32_t> input_shape, std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape, std::vector<uint32_t> output_shape,
std::vector<int32_t> axis, float scale_in,
float scale, float scale_out,
int32_t zeroPoint); int32_t zeroPoint);
int insert_transpose_node(void* ctx,
const std::string& input_name,
const std::string& output_name,
std::vector<uint32_t> input_shape,
std::vector<uint32_t> output_shape,
std::vector<int32_t> axis,
float scale,
int32_t zeroPoint);
void transpose(const int8_t* input_data, void transpose(const int8_t* input_data,
uint8_t* output_data, uint8_t* output_data,
......
...@@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() { ...@@ -33,6 +33,14 @@ bool SubgraphEngine::BuildDeviceProgram() {
BuildOriginProgram(); BuildOriginProgram();
} }
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
unsigned int version; unsigned int version;
Neuron_getVersion(&version); Neuron_getVersion(&version);
VLOG(3) << "Neuron Adapter version: " << version; VLOG(3) << "Neuron Adapter version: " << version;
...@@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() { ...@@ -108,18 +116,16 @@ bool SubgraphEngine::BuildDeviceProgram() {
} }
VLOG(3) << "[APU] APU NIR model created!"; VLOG(3) << "[APU] APU NIR model created!";
auto GetCurrentUS = []() -> double { VLOG(1) << "[APU] APU NIR model created, Create cost "
struct timeval time; << GetCurrentUS() - start_time << " us";
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec; start_time = GetCurrentUS();
};
auto start_time = GetCurrentUS();
compilation_ = lite::apu::Device::Global().Build(model_); compilation_ = lite::apu::Device::Global().Build(model_);
if (compilation_ == nullptr) { if (compilation_ == nullptr) {
LOG(WARNING) << "[APU] Build APU DLA model failed!"; LOG(WARNING) << "[APU] Build APU DLA model failed!";
return false; return false;
} }
VLOG(3) << "[APU] APU DLA model created, Build cost " VLOG(1) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us"; << GetCurrentUS() - start_time << " us";
return true; return true;
} }
...@@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() { ...@@ -176,7 +182,7 @@ bool SubgraphEngine::LaunchDeviceProgram() {
} }
} }
NeuronExecution_free(run); NeuronExecution_free(run);
VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(1) << "[APU] Process cost " << GetCurrentUS() - start_time << " us";
return true; return true;
} }
......
...@@ -26,6 +26,88 @@ namespace lite { ...@@ -26,6 +26,88 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename Dtype>
void naive_transpose(const Dtype* din, Dtype* dout, int m, int n) {
int k = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
dout[k++] = din[j * n + i];
}
}
}
template <PrecisionType PType>
void fc_trans_weights(const Tensor& tin, Tensor* tout);
template <>
void fc_trans_weights<PRECISION(kFloat)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto* ptr_in = tin.data<float>();
auto* ptr_out = tout->mutable_data<float>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <>
void fc_trans_weights<PRECISION(kInt8)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto* ptr_in = tin.data<int8_t>();
auto* ptr_out = tout->mutable_data<int8_t>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <PrecisionType PType, PrecisionType OutType>
bool check_fc_use_gemm(int m, const std::vector<float>& scale, bool has_bias) {
return m > 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kFloat)>(
int m, const std::vector<float>& scale, bool has_bias) {
  CHECK_GT(scale.size(), 0) << "Int8 FC param must have weight_scale";
return m > 1 && scale.size() == 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kInt8)>(
int m, const std::vector<float>& scale, bool has_bias) {
  CHECK_GT(scale.size(), 0) << "Int8 FC param must have weight_scale";
return m > 1 && scale.size() == 1 && !has_bias;
}
template <PrecisionType PType, PrecisionType OutType>
void FcCompute<PType, OutType>::ReInitWhenNeeded() {
auto& param = this->template Param<operators::FcParam>();
auto x_dims = param.input->dims();
if (last_shape_ == x_dims) {
return;
}
last_shape_ = x_dims;
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
CHECK_EQ(k_, w_dims[0]);
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
flag_gemm_ = check_fc_use_gemm<PType, OutType>(
m_, param.weight_scale, param.bias != nullptr);
if (!flag_trans_weights_ && !flag_gemm_) {
flag_trans_weights_ = true;
fc_trans_weights<PType>(*param.w, &weights_);
}
}
/// for fp32 kernel /// for fp32 kernel
template <> template <>
void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
...@@ -71,8 +153,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -71,8 +153,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
/// update bias /// update bias
if (param.bias) { if (param.bias) {
bias_.Resize(param.bias->dims()); bias_.Resize(param.bias->dims());
auto ptr = bias_.mutable_data<float>(); auto* ptr = bias_.mutable_data<float>();
auto ptr_in = bias_.data<float>(); auto* ptr_in = bias_.data<float>();
float out_scale = param.output_scale; float out_scale = param.output_scale;
for (int i = 0; i < bias_.numel(); ++i) { for (int i = 0; i < bias_.numel(); ++i) {
ptr[i] = ptr_in[i] / out_scale; ptr[i] = ptr_in[i] / out_scale;
...@@ -86,9 +168,9 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -86,9 +168,9 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<float>(); auto* i_data = param.input->data<float>();
auto o_data = param.output->mutable_data<float>(); auto* o_data = param.output->mutable_data<float>();
auto w_data = param.w->data<float>(); auto* w_data = flag_gemm_ ? param.w->data<float>() : weights_.data<float>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -125,8 +207,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -125,8 +207,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
} }
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::sgemv(w_data, lite::arm::math::sgemv(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
...@@ -147,9 +229,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -147,9 +229,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<int8_t>(); auto* i_data = param.input->data<int8_t>();
auto o_data = param.output->mutable_data<float>(); auto* o_data = param.output->mutable_data<float>();
auto w_data = param.w->data<int8_t>(); auto* w_data =
flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -182,8 +265,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -182,8 +265,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
} }
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::gemv_int8(w_data, lite::arm::math::gemv_int8(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
...@@ -205,9 +288,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -205,9 +288,10 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto i_data = param.input->data<int8_t>(); auto* i_data = param.input->data<int8_t>();
auto o_data = param.output->mutable_data<int8_t>(); auto* o_data = param.output->mutable_data<int8_t>();
auto w_data = param.w->data<int8_t>(); auto* w_data =
flag_trans_weights_ ? weights_.data<int8_t>() : param.w->data<int8_t>();
const float* b_data = param.bias ? param.bias->data<float>() : nullptr; const float* b_data = param.bias ? param.bias->data<float>() : nullptr;
if (flag_trans_bias_) { if (flag_trans_bias_) {
b_data = bias_.data<float>(); b_data = bias_.data<float>();
...@@ -240,8 +324,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() { ...@@ -240,8 +324,8 @@ void FcCompute<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
&ctx); &ctx);
} else { } else {
for (int i = 0; i < m_; ++i) { for (int i = 0; i < m_; ++i) {
auto i_data_batch = i_data + i * k_; auto* i_data_batch = i_data + i * k_;
auto o_data_batch = o_data + i * n_; auto* o_data_batch = o_data + i * n_;
lite::arm::math::gemv_int8(w_data, lite::arm::math::gemv_int8(w_data,
i_data_batch, i_data_batch,
o_data_batch, o_data_batch,
......
...@@ -24,92 +24,12 @@ namespace lite { ...@@ -24,92 +24,12 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename Dtype>
void naive_transpose(const Dtype* din, Dtype* dout, int m, int n) {
int k = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
dout[k++] = din[j * n + i];
}
}
}
template <PrecisionType PType>
void fc_trans_weights(const Tensor& tin, Tensor* tout);
template <>
void fc_trans_weights<PRECISION(kFloat)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto ptr_in = tin.data<float>();
auto ptr_out = tout->mutable_data<float>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <>
void fc_trans_weights<PRECISION(kInt8)>(const Tensor& tin, Tensor* tout) {
CHECK_EQ(tin.dims().size(), 2) << "fc weights size must = 2";
int m = tin.dims()[0];
int n = tin.dims()[1];
tout->Resize({n, m});
auto ptr_in = tin.data<int8_t>();
auto ptr_out = tout->mutable_data<int8_t>();
naive_transpose(ptr_in, ptr_out, m, n);
}
template <PrecisionType PType, PrecisionType OutType>
bool check_fc_use_gemm(int m, const std::vector<float>& scale, bool has_bias) {
return m > 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kFloat)>(
int m, const std::vector<float>& scale, bool has_bias) {
CHECK(scale.size() > 0) << "Int8 FC param must has weight_scale";
return m > 1 && scale.size() == 1;
}
template <>
bool check_fc_use_gemm<PRECISION(kInt8), PRECISION(kInt8)>(
int m, const std::vector<float>& scale, bool has_bias) {
CHECK(scale.size() > 0) << "Int8 FC param must has weight_scale";
return m > 1 && scale.size() == 1 && !has_bias;
}
template <PrecisionType PType, PrecisionType OutType> template <PrecisionType PType, PrecisionType OutType>
class FcCompute : public KernelLite<TARGET(kARM), PType> { class FcCompute : public KernelLite<TARGET(kARM), PType> {
public: public:
using param_t = operators::FcParam; using param_t = operators::FcParam;
virtual void ReInitWhenNeeded() { virtual void ReInitWhenNeeded();
auto& param = this->template Param<operators::FcParam>();
auto x_dims = param.input->dims();
if (last_shape_ == x_dims) {
return;
}
last_shape_ = x_dims;
auto w_dims = param.w_dims;
auto& ctx = this->ctx_->template As<ARMContext>();
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
flag_gemm_ = check_fc_use_gemm<PType, OutType>(
m_, param.weight_scale, param.bias != nullptr);
if (flag_trans_weights_ == flag_gemm_) {
flag_trans_weights_ = !flag_trans_weights_;
Tensor tmp_tensor;
fc_trans_weights<PType>(*param.w, &tmp_tensor);
param.w->CopyDataFrom(tmp_tensor);
}
}
virtual void PrepareForRun(); virtual void PrepareForRun();
virtual void Run(); virtual void Run();
...@@ -117,6 +37,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> { ...@@ -117,6 +37,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> {
private: private:
DDim last_shape_; DDim last_shape_;
Tensor weights_;
Tensor bias_; Tensor bias_;
bool flag_trans_weights_{false}; bool flag_trans_weights_{false};
bool flag_trans_bias_{false}; bool flag_trans_bias_{false};
......
...@@ -52,7 +52,7 @@ void Decode(const Tensor& emission_weights, ...@@ -52,7 +52,7 @@ void Decode(const Tensor& emission_weights,
for (int k = 1; k < seq_len; ++k) { for (int k = 1; k < seq_len; ++k) {
for (int i = 0; i < tag_num; ++i) { for (int i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max(); T max_score = -(std::numeric_limits<T>::max)();
int max_j = 0; int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) { for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] + T score = alpha_value[(k - 1) * tag_num + j] +
...@@ -67,7 +67,7 @@ void Decode(const Tensor& emission_weights, ...@@ -67,7 +67,7 @@ void Decode(const Tensor& emission_weights,
} }
} }
T max_score = -std::numeric_limits<T>::max(); T max_score = -(std::numeric_limits<T>::max)();
int max_i = 0; int max_i = 0;
for (size_t i = 0; i < tag_num; ++i) { for (size_t i = 0; i < tag_num; ++i) {
T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i];
......
...@@ -72,10 +72,10 @@ static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { ...@@ -72,10 +72,10 @@ static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) {
box2[3] < box1[1]) { box2[3] < box1[1]) {
return static_cast<T>(0.); return static_cast<T>(0.);
} else { } else {
const T inter_xmin = std::max(box1[0], box2[0]); const T inter_xmin = (std::max)(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = (std::max)(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = (std::min)(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = (std::min)(box1[3], box2[3]);
T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.); T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
T inter_w = inter_xmax - inter_xmin + norm; T inter_w = inter_xmax - inter_xmin + norm;
T inter_h = inter_ymax - inter_ymin + norm; T inter_h = inter_ymax - inter_ymin + norm;
......
...@@ -128,7 +128,7 @@ class TensorFormatter { ...@@ -128,7 +128,7 @@ class TensorFormatter {
void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) { void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) {
int64_t print_size = summarize_ == -1 int64_t print_size = summarize_ == -1
? print_tensor.numel() ? print_tensor.numel()
: std::min(summarize_, print_tensor.numel()); : (std::min)(summarize_, print_tensor.numel());
const T* data = print_tensor.data<T>(); // Always kHost, so unnecessary to const T* data = print_tensor.data<T>(); // Always kHost, so unnecessary to
// copy the data from device // copy the data from device
log_stream << " - data: ["; log_stream << " - data: [";
......
...@@ -83,10 +83,10 @@ static inline T JaccardOverlap(const std::vector<T>& box1, ...@@ -83,10 +83,10 @@ static inline T JaccardOverlap(const std::vector<T>& box1,
box2[3] < box1[1]) { box2[3] < box1[1]) {
return static_cast<T>(0.); return static_cast<T>(0.);
} else { } else {
const T inter_xmin = std::max(box1[0], box2[0]); const T inter_xmin = (std::max)(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = (std::max)(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = (std::min)(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = (std::min)(box1[3], box2[3]);
T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.); T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
T inter_w = inter_xmax - inter_xmin + norm; T inter_w = inter_xmax - inter_xmin + norm;
T inter_h = inter_ymax - inter_ymin + norm; T inter_h = inter_ymax - inter_ymin + norm;
...@@ -183,10 +183,10 @@ void DeltaScoreToPrediction( ...@@ -183,10 +183,10 @@ void DeltaScoreToPrediction(
pred_box_xmax = pred_box_xmax / im_scale; pred_box_xmax = pred_box_xmax / im_scale;
pred_box_ymax = pred_box_ymax / im_scale; pred_box_ymax = pred_box_ymax / im_scale;
pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); pred_box_xmin = (std::max)((std::min)(pred_box_xmin, im_width - 1), zero);
pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); pred_box_ymin = (std::max)((std::min)(pred_box_ymin, im_height - 1), zero);
pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); pred_box_xmax = (std::max)((std::min)(pred_box_xmax, im_width - 1), zero);
pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); pred_box_ymax = (std::max)((std::min)(pred_box_ymax, im_height - 1), zero);
std::vector<T> one_pred; std::vector<T> one_pred;
one_pred.push_back(pred_box_xmin); one_pred.push_back(pred_box_xmin);
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cmath>
#include "lite/core/subgraph_bridge_registry.h" #include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" #include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" #include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cmath>
#include "lite/core/subgraph_bridge_registry.h" #include "lite/core/subgraph_bridge_registry.h"
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" #include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" #include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
......
...@@ -71,7 +71,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, ...@@ -71,7 +71,7 @@ inline void get_mid_dims(const lite::DDim &x_dims,
for (size_t j = 0; j < i; ++j) { for (size_t j = 0; j < i; ++j) {
(*pre) *= y_dims[j]; (*pre) *= y_dims[j];
} }
*n = std::max(x_dims[i + axis], y_dims[i]); *n = (std::max)(x_dims[i + axis], y_dims[i]);
*mid_flag = 1; *mid_flag = 1;
mid = i; mid = i;
break; break;
......
...@@ -55,7 +55,7 @@ class SequenceArithmeticCompute ...@@ -55,7 +55,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] + input_y[j]; t_out[j] = input_x[j] + input_y[j];
} }
...@@ -73,7 +73,7 @@ class SequenceArithmeticCompute ...@@ -73,7 +73,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] - input_y[j]; t_out[j] = input_x[j] - input_y[j];
} }
...@@ -91,7 +91,7 @@ class SequenceArithmeticCompute ...@@ -91,7 +91,7 @@ class SequenceArithmeticCompute
auto input_x = x_data + x_seq_offset[i] * inner_size; auto input_x = x_data + x_seq_offset[i] * inner_size;
auto input_y = y_data + y_seq_offset[i] * inner_size; auto input_y = y_data + y_seq_offset[i] * inner_size;
auto t_out = out_data + x_seq_offset[i] * inner_size; auto t_out = out_data + x_seq_offset[i] * inner_size;
int len = std::min(len_x, len_y); int len = (std::min)(len_x, len_y);
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
t_out[j] = input_x[j] * input_y[j]; t_out[j] = input_x[j] * input_y[j];
} }
......
...@@ -49,8 +49,8 @@ class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -49,8 +49,8 @@ class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
bool padding_trainable = false; bool padding_trainable = false;
const Tensor* padding_data = nullptr; const Tensor* padding_data = nullptr;
int up_pad = std::max(0, -context_start); int up_pad = (std::max)(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1); int down_pad = (std::max)(0, context_start + context_length - 1);
auto sequence_width = static_cast<int64_t>(in->dims()[1]); auto sequence_width = static_cast<int64_t>(in->dims()[1]);
std::vector<int64_t> col_shape{in->dims()[0], std::vector<int64_t> col_shape{in->dims()[0],
......
...@@ -102,9 +102,9 @@ void slice_compute(const lite::Tensor* in, ...@@ -102,9 +102,9 @@ void slice_compute(const lite::Tensor* in,
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0); start = (std::max)(start, 0);
end = std::max(end, 0); end = (std::max)(end, 0);
end = std::min(end, dim_value); end = (std::min)(end, dim_value);
CHECK_GT(end, start) << "end should greater than start"; CHECK_GT(end, start) << "end should greater than start";
out_dims[axes[i]] = end - start; out_dims[axes[i]] = end - start;
} }
...@@ -172,7 +172,7 @@ void slice_compute(const lite::Tensor* in, ...@@ -172,7 +172,7 @@ void slice_compute(const lite::Tensor* in,
if (start < 0) { if (start < 0) {
start = (start + in_dims[axes[i]]); start = (start + in_dims[axes[i]]);
} }
start = std::max(start, 0); start = (std::max)(start, 0);
offsets[axes[i]] = start; offsets[axes[i]] = start;
} }
auto in_t = auto in_t =
......
...@@ -391,7 +391,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { ...@@ -391,7 +391,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
} }
{ // the 3rd field, tensor data { // the 3rd field, tensor data
uint64_t size = tensor.memory_size(); uint64_t size = tensor.memory_size();
CHECK_LT(size, std::numeric_limits<std::streamsize>::max()) CHECK_LT(size, (std::numeric_limits<std::streamsize>::max)())
<< "Index overflow when writing tensor"; << "Index overflow when writing tensor";
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
...@@ -461,7 +461,7 @@ void SetParamInfoNaive(naive_buffer::ParamDesc *param_desc, ...@@ -461,7 +461,7 @@ void SetParamInfoNaive(naive_buffer::ParamDesc *param_desc,
} }
desc.SetDim(tensor.dims().Vectorize()); desc.SetDim(tensor.dims().Vectorize());
uint64_t size = tensor.memory_size(); uint64_t size = tensor.memory_size();
CHECK_LT(size, std::numeric_limits<std::streamsize>::max()) CHECK_LT(size, (std::numeric_limits<std::streamsize>::max)())
<< "Index overflow when writing tensor"; << "Index overflow when writing tensor";
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
......
...@@ -62,7 +62,7 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings, ...@@ -62,7 +62,7 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings,
if (padding_algorithm == "SAME") { if (padding_algorithm == "SAME") {
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
int pad_sum = std::max( int pad_sum = (std::max)(
(out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2],
(int64_t)0); (int64_t)0);
int pad_0 = pad_sum / 2; int pad_0 = pad_sum / 2;
......
...@@ -75,7 +75,7 @@ bool ElementwiseOp::InferShapeImpl() const { ...@@ -75,7 +75,7 @@ bool ElementwiseOp::InferShapeImpl() const {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1; out_dims_array[i] = -1;
} else { } else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]);
} }
} }
param_.Out->Resize(DDim(out_dims_array)); param_.Out->Resize(DDim(out_dims_array));
......
...@@ -128,8 +128,8 @@ inline void UpdatePadding(std::vector<int> *paddings, ...@@ -128,8 +128,8 @@ inline void UpdatePadding(std::vector<int> *paddings,
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
int pad_sum = int pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], (std::max)((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
(int64_t)0); (int64_t)0);
int pad_0 = pad_sum / 2; int pad_0 = pad_sum / 2;
int pad_1 = pad_sum - pad_0; int pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0; *(paddings->begin() + i * 2) = pad_0;
......
...@@ -51,9 +51,9 @@ bool SliceOp::InferShapeImpl() const { ...@@ -51,9 +51,9 @@ bool SliceOp::InferShapeImpl() const {
if (dim_value > 0) { if (dim_value > 0) {
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0); start = (std::max)(start, 0);
end = std::max(end, 0); end = (std::max)(end, 0);
end = std::min(end, dim_value); end = (std::min)(end, dim_value);
out_dims[axes[i]] = end - start; out_dims[axes[i]] = end - start;
} }
} }
......
...@@ -3,3 +3,4 @@ add_subdirectory(math) ...@@ -3,3 +3,4 @@ add_subdirectory(math)
add_subdirectory(cv) add_subdirectory(cv)
add_subdirectory(cv/anakin) add_subdirectory(cv/anakin)
add_subdirectory(api) add_subdirectory(api)
add_subdirectory(benchmark)
...@@ -9,11 +9,18 @@ if(LITE_WITH_ARM) ...@@ -9,11 +9,18 @@ if(LITE_WITH_ARM)
endif() endif()
function(xpu_x86_without_xtcl_test TARGET MODEL DATA) function(xpu_x86_without_xtcl_test TARGET MODEL DATA)
lite_cc_test(${TARGET} SRCS ${TARGET}.cc if(${DATA} STREQUAL "")
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils lite_cc_test(${TARGET} SRCS ${TARGET}.cc
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL} ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
--data_dir=${LITE_MODEL_DIR}/${DATA}) ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL})
else()
lite_cc_test(${TARGET} SRCS ${TARGET}.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/${MODEL} --data_dir=${LITE_MODEL_DIR}/${DATA})
endif()
if(WITH_TESTING) if(WITH_TESTING)
add_dependencies(${TARGET} extern_lite_download_${MODEL}_tar_gz) add_dependencies(${TARGET} extern_lite_download_${MODEL}_tar_gz)
if(NOT ${DATA} STREQUAL "") if(NOT ${DATA} STREQUAL "")
...@@ -26,8 +33,8 @@ if(LITE_WITH_XPU AND NOT LITE_WITH_XTCL) ...@@ -26,8 +33,8 @@ if(LITE_WITH_XPU AND NOT LITE_WITH_XTCL)
xpu_x86_without_xtcl_test(test_resnet50_fp32_xpu resnet50 ILSVRC2012_small) xpu_x86_without_xtcl_test(test_resnet50_fp32_xpu resnet50 ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_googlenet_fp32_xpu GoogLeNet ILSVRC2012_small) xpu_x86_without_xtcl_test(test_googlenet_fp32_xpu GoogLeNet ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_vgg19_fp32_xpu VGG19 ILSVRC2012_small) xpu_x86_without_xtcl_test(test_vgg19_fp32_xpu VGG19 ILSVRC2012_small)
xpu_x86_without_xtcl_test(test_ernie_fp32_xpu ernie "") xpu_x86_without_xtcl_test(test_ernie_fp32_xpu ernie bert_data)
xpu_x86_without_xtcl_test(test_bert_fp32_xpu bert "") xpu_x86_without_xtcl_test(test_bert_fp32_xpu bert bert_data)
endif() endif()
if(LITE_WITH_RKNPU) if(LITE_WITH_RKNPU)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
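// Reads raw BERT/ERNIE inputs from a text file. Each line is expected to hold
// four "shape:data" fields separated by ';'; the shape and data values are
// space-separated, and the shape of the first field is stored in input_shapes.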
template <class T = int64_t>
void ReadRawData(const std::string& input_data_dir,
std::vector<std::vector<T>>* input0,
std::vector<std::vector<T>>* input1,
std::vector<std::vector<T>>* input2,
std::vector<std::vector<T>>* input3,
std::vector<std::vector<int64_t>>* input_shapes) {
auto lines = ReadLines(input_data_dir);
for (auto line : lines) {
std::vector<std::string> shape_and_data = Split(line, ";");
std::vector<int64_t> input_shape =
Split<int64_t>(Split(shape_and_data[0], ":")[0], " ");
input_shapes->emplace_back(input_shape);
std::vector<T> input0_data =
Split<T>(Split(shape_and_data[0], ":")[1], " ");
input0->emplace_back(input0_data);
std::vector<T> input1_data =
Split<T>(Split(shape_and_data[1], ":")[1], " ");
input1->emplace_back(input1_data);
std::vector<T> input2_data =
Split<T>(Split(shape_and_data[2], ":")[1], " ");
input2->emplace_back(input2_data);
std::vector<T> input3_data =
Split<T>(Split(shape_and_data[3], ":")[1], " ");
input3->emplace_back(input3_data);
}
}
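// Resizes the tensor_id-th input of the predictor to tensor_shape and copies
// tensor_value into it; the value count must match the shape's element count.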
template <class T = int64_t>
void FillTensor(const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
int tensor_id,
const std::vector<int64_t>& tensor_shape,
const std::vector<T>& tensor_value) {
predictor->GetInput(tensor_id)->Resize(tensor_shape);
int64_t tensor_size = 1;
for (size_t i = 0; i < tensor_shape.size(); i++) {
tensor_size *= tensor_shape[i];
}
CHECK_EQ(static_cast<size_t>(tensor_size), tensor_value.size());
memcpy(predictor->GetInput(tensor_id)->mutable_data<T>(),
tensor_value.data(),
sizeof(T) * tensor_size);
}
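// Returns the fraction of samples whose top-3 class ordering (indices 0/1/2
// sorted by score) matches the reference outputs read from out_file.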
float CalBertOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
std::vector<size_t> out_index{0, 1, 2};
std::vector<size_t> ref_out_index{0, 1, 2};
std::sort(out_index.begin(),
out_index.end(),
[&out, i](size_t a, size_t b) { return out[i][a] > out[i][b]; });
std::sort(ref_out_index.begin(),
ref_out_index.end(),
[&ref_out, i](size_t a, size_t b) {
return ref_out[i][a] > ref_out[i][b];
});
right_num += (out_index == ref_out_index);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
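// Returns the fraction of samples whose first output value lies within 0.01
// of the reference value read from out_file.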
float CalErnieOutAccuracy(const std::vector<std::vector<float>>& out,
const std::string& out_file) {
auto lines = ReadLines(out_file);
std::vector<std::vector<float>> ref_out;
for (auto line : lines) {
ref_out.emplace_back(Split<float>(line, " "));
}
int right_num = 0;
for (size_t i = 0; i < out.size(); i++) {
right_num += (std::fabs(out[i][0] - ref_out[i][0]) < 0.01f);
}
return static_cast<float>(right_num) / static_cast<float>(out.size());
}
} // namespace lite
} // namespace paddle
...@@ -21,23 +21,16 @@ ...@@ -21,23 +21,16 @@
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/tests/api/bert_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 9, "iteration times to run");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
template <typename T> TEST(Bert, test_bert_fp32_xpu) {
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_fp32_xpu) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir); config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
...@@ -46,56 +39,58 @@ TEST(Ernie, test_ernie_fp32_xpu) { ...@@ -46,56 +39,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
config.set_xpu_workspace_l3_size_per_thread(); config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1; std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
int64_t seq_len = 64; std::vector<std::vector<int64_t>> input0;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1}); std::vector<std::vector<int64_t>> input1;
std::vector<int64_t> input_shape{batch_size, seq_len, 1}; std::vector<std::vector<int64_t>> input2;
predictor->GetInput(0)->Resize(input_shape); std::vector<std::vector<int64_t>> input3;
predictor->GetInput(1)->Resize(input_shape); std::vector<std::vector<int64_t>> input_shapes;
predictor->GetInput(2)->Resize(input_shape); ReadRawData(
predictor->GetInput(3)->Resize(input_shape); input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
std::vector<int64_t> shape = {1, 64, 1};
std::vector<int64_t> fill_value(64, 0);
for (int j = 0; j < 4; j++) {
FillTensor(predictor, j, shape, fill_value);
}
predictor->Run(); predictor->Run();
} }
auto start = GetCurrentUS(); std::vector<std::vector<float>> out_rets;
for (int i = 0; i < FLAGS_repeats; ++i) { out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (int i = 0; i < FLAGS_iteration; ++i) {
FillTensor(predictor, 0, input_shapes[i], input0[i]);
FillTensor(predictor, 1, input_shapes[i], input1[i]);
FillTensor(predictor, 2, input_shapes[i], input2[i]);
FillTensor(predictor, 3, input_shapes[i], input3[i]);
double start = GetCurrentUS();
predictor->Run(); predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 3);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
} }
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", iteration: " << FLAGS_iteration << ", spend "
<< " ms in average."; << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::vector<std::vector<float>> results; std::string ref_out_file = FLAGS_data_dir + std::string("/bert_out.txt");
results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022})); float out_accuracy = CalBertOutAccuracy(out_rets, ref_out_file);
auto out = predictor->GetOutput(0); ASSERT_GT(out_accuracy, 0.95f);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 3);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 3e-5);
}
}
} }
} // namespace lite } // namespace lite
......
...@@ -21,8 +21,12 @@ ...@@ -21,8 +21,12 @@
#include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/tests/api/bert_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
DEFINE_string(data_dir, "", "data dir");
DEFINE_int32(iteration, 9, "iteration times to run");
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -46,56 +50,58 @@ TEST(Ernie, test_ernie_fp32_xpu) { ...@@ -46,56 +50,58 @@ TEST(Ernie, test_ernie_fp32_xpu) {
config.set_xpu_workspace_l3_size_per_thread(); config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1; std::string input_data_file = FLAGS_data_dir + std::string("/bert_in.txt");
int64_t seq_len = 64; std::vector<std::vector<int64_t>> input0;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1}); std::vector<std::vector<int64_t>> input1;
std::vector<int64_t> input_shape{batch_size, seq_len, 1}; std::vector<std::vector<int64_t>> input2;
predictor->GetInput(0)->Resize(input_shape); std::vector<std::vector<int64_t>> input3;
predictor->GetInput(1)->Resize(input_shape); std::vector<std::vector<int64_t>> input_shapes;
predictor->GetInput(2)->Resize(input_shape); ReadRawData(
predictor->GetInput(3)->Resize(input_shape); input_data_file, &input0, &input1, &input2, &input3, &input_shapes);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
std::vector<int64_t> shape = {1, 64, 1};
std::vector<int64_t> fill_value(64, 0);
for (int j = 0; j < 4; j++) {
FillTensor(predictor, j, shape, fill_value);
}
predictor->Run(); predictor->Run();
} }
auto start = GetCurrentUS(); std::vector<std::vector<float>> out_rets;
for (int i = 0; i < FLAGS_repeats; ++i) { out_rets.resize(FLAGS_iteration);
double cost_time = 0;
for (int i = 0; i < FLAGS_iteration; ++i) {
FillTensor(predictor, 0, input_shapes[i], input0[i]);
FillTensor(predictor, 1, input_shapes[i], input1[i]);
FillTensor(predictor, 2, input_shapes[i], input2[i]);
FillTensor(predictor, 3, input_shapes[i], input3[i]);
double start = GetCurrentUS();
predictor->Run(); predictor->Run();
cost_time += GetCurrentUS() - start;
auto output_tensor = predictor->GetOutput(0);
auto output_shape = output_tensor->shape();
auto output_data = output_tensor->data<float>();
ASSERT_EQ(output_shape.size(), 2UL);
ASSERT_EQ(output_shape[0], 1);
ASSERT_EQ(output_shape[1], 1);
int output_size = output_shape[0] * output_shape[1];
out_rets[i].resize(output_size);
memcpy(&(out_rets[i].at(0)), output_data, sizeof(float) * output_size);
} }
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", iteration: " << FLAGS_iteration << ", spend "
<< " ms in average."; << cost_time / FLAGS_iteration / 1000.0 << " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.108398}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1);
for (size_t i = 0; i < results.size(); ++i) { std::string ref_out_file = FLAGS_data_dir + std::string("/ernie_out.txt");
for (size_t j = 0; j < results[i].size(); ++j) { float out_accuracy = CalErnieOutAccuracy(out_rets, ref_out_file);
EXPECT_NEAR( ASSERT_GT(out_accuracy, 0.95f);
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 2e-5);
}
}
} }
} // namespace lite } // namespace lite
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU AND NOT LITE_WITH_XPU) AND (LITE_WITH_ARM))
lite_cc_test(get_conv_latency SRCS src/get_conv_latency.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_batchnorm_latency SRCS src/get_batchnorm_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_pooling_latency SRCS src/get_pooling_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_fc_latency SRCS src/get_fc_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(get_activation_latency SRCS src/get_activation_latency.cc DEPS ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
# How to run
```shell
-- cd Paddle-Lite/lite/tests/benchmark
-- ./build_benchmark_ops.sh  # push all unit-test executables under the build directory to the device
Inside build_benchmark_ops.sh, run: python get_latency_lookup_table.py --ops_path ops.txt --latency_lookup_table_path latency_lookup_table.txt
where ops.txt is the input network description file and latency_lookup_table.txt is the per-op latency report produced by running the Lite unit tests.
```
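For reference, the table-generation script can also be invoked directly with its full set of flags; the values below are illustrative, and the defaults are defined in get_args() of get_latency_lookup_table.py further down:
```shell
# Illustrative invocation; all flags come from get_args() in
# get_latency_lookup_table.py, the values shown are just examples.
python get_latency_lookup_table.py \
    --ops_path ops.txt \
    --latency_lookup_table_path latency_lookup_table.txt \
    --platform android \
    --threads 1 \
    --power_mode 0 \
    --warmup_times 5 \
    --repeats_times 100 \
    --arm_v7_v8 armv8
```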
# Input ops.txt format
-- op_name	[dim0 dim1 dim2 dim3]	(op_param0, op_param1, ..., dtype=xxx)
Each line of ops.txt has three fields: the first is op_name, the second is the input tensor's input_dims,
and the third, enclosed in (), describes the op's parameters.
# Note: the three fields are separated by tabs, the sub-fields inside the parameter list are separated by commas,
# and the values inside the [] describing tensor dimensions are separated by spaces; commas and tabs must not be used there.
op_name currently supports conv/activation/batchnorm/pooling/fc;
input_dims describes the input tensor layout, e.g. NCHW 4-D tensors;
op_param0, op_param1, etc. describe the op's parameters; for example, a conv op has ch_out/stride/group/kernel/pad/dilation/flag_bias/flag_act;
dtype is the data type used by the op; legal values are float/int8_float/int8_int8. Currently conv supports all three data types, while the other ops only support float. The per-op formats are listed below, followed by a complete sample file.
# conv op format
conv	[1 96 112 112]	(ch_out=48, stride=1, group=1, kernel=1x1, pad=0, dilation=1, flag_bias=0, flag_act=0, dtype=float)
ch_out is the number of output channels; kernel is the convolution kernel size, with legal values such as 1x1/3x3/5x5; pad is the boundary padding; flag_bias indicates whether a bias is present; flag_act indicates whether an activation is fused, with legal values 0/1/2/4.
# activation op format
activation	[1 8 64 64]	(act_type=relu)
act_type is the activation type; legal values are relu/relu6/leaky_relu/tanh/swish/exp/abs/hard_swish/reciprocal/threshold_relu.
# batchnorm op format
batchnorm	[1 8 64 64]	(epsilon=1e-4f, momentum=0.9f)
epsilon is the batchnorm epsilon parameter, default 1e-4f;
momentum is the batchnorm momentum parameter, default 0.9f.
# pooling op format
pooling	[1 8 64 64]	(stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=1, pooling_type=max)
stride is the stride of the pooling operation, default 2; pad is the boundary padding, default 0;
kernel is the pooling kernel size, typically 2x2 (the default);
ceil_mode indicates whether pooling uses ceil; 0 means false (the default), any other value means true;
flag_global indicates whether pooling is global over the WxH dimensions; 0 means false (the default), any other value means true;
exclusive is the exclusive flag of the pooling operation; 1 means true (the default), any other value means false;
pooling_type is the pooling type; legal values are max (the default)/avg.
# fc op format
fc	[1 64]	(flag_bias=1, param_dim=64x1000)
flag_bias indicates whether the fc op has a bias; 1 (the default) means true, otherwise false;
param_dim gives the `k x n` dimensions of the fc op, where k must match the k in input_dims=[m k].
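Putting the per-op formats above together, a minimal sample ops.txt could look like the following (fields are tab-separated as described above; spacing is widened here for readability, and the lines simply reuse the per-op examples):
```
conv	[1 96 112 112]	(ch_out=48, stride=1, group=1, kernel=1x1, pad=0, dilation=1, flag_bias=0, flag_act=0, dtype=float)
activation	[1 8 64 64]	(act_type=relu)
batchnorm	[1 8 64 64]	(epsilon=1e-4f, momentum=0.9f)
pooling	[1 8 64 64]	(stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=1, pooling_type=max)
fc	[1 64]	(flag_bias=1, param_dim=64x1000)
```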
# Output latency_lookup_table.txt format
dev_info core_num thread_num power_mode core0 arch core1 arch core2 arch core3 arch core4 arch core5 arch core6 arch core7 arch
Hisilicon Kirin980 8 1 0 ARM_A55 ARM_A55 ARM_A55 ARM_A55 ARM_A76 ARM_A76 ARM_A76 ARM_A76
op_name input_dims output_dims param_info min_latency(ms) max_latency(ms) avg_latency(ms)
conv [1 96 112 112] [1 48 114 114] (ch_out=48, stride=1, pad=0, kernel=1x1, group=1, dilation=1, flag_bias=0, flag_act=0, dtype=float) 3.469 4.111 3.52088
fc [1 64] [64 1000] (param_dim=64x1000, flag_bias=1, dtype=float) 0.135 0.176 0.13779
batchnorm [1 8 64 64] [1 8 64 64] (epsilon=1e-4f, momentum=0.9f, dtype=float) 0.014 0.178 0.01679
pooling [1 8 64 64] [1 8 32 32] (stride=2, pad=0, kernel=2x2, ceil_mode=0, flag_global=0, exclusive=0, pooling_type=avg, dtype=float) 0.009 0.011 0.00983
activation [1 8 64 64] [1 8 64 64] (act_type=relu, dtype=float) 0.01 0.036 0.01103
-- The first block is the header section, containing the fields `dev_info` `arm_v7/v8` `core_num` `thread_num` `power_mode` `core0 arch` ... `core7 arch`:
`dev_info` is the device hardware vendor/model, `arm_v7/v8` indicates whether the architecture is armv7 or armv8, `core_num` is the number of CPU cores, `thread_num` is the configured number of running threads,
`power_mode` is the CPU core-binding mode,
and `core0 arch` ... `core7 arch` give the architecture of each ARM CPU core.
The second block is the op section, containing the fields `op_name` `input_dims` `output_dims` `param_info` `min_latency` `max_latency` `avg_latency`:
`output_dims` is the output tensor shape computed for the op from `input_dims` and `param_info`;
`min_latency(ms)` `max_latency(ms)` `avg_latency(ms)` are the min/max/avg latencies measured for that op.
#!/usr/bin/env bash
exe_dir="/data/local/tmp/bin"
work_dir=$(pwd)
os=android
abi=armv8
lang=gcc
function print_usage {
echo "----------------------------------------"
echo -e " ./push2device.sh --arm_os=<os> --arm_abi=<abi> --arm_lang=<lang>"
echo -e "--arm_os:\t android, only support android now"
echo -e "--arm_abi:\t armv8|armv7"
echo -e "--arm_lang:\t gcc|clang"
echo -e "make sure directory: PaddleLite/build.lite.${arm_os}.${arm_abi}.${arm_lang} exsits!"
echo "----------------------------------------"
}
function main {
for i in "$@"; do
case $i in
--arm_os=*)
os="${i#*=}"
shift
;;
--arm_abi=*)
abi="${i#*=}"
shift
;;
--arm_lang=*)
lang="${i#*=}"
shift
;;
*)
print_usage
exit 1
;;
esac
done
build_dir=$work_dir/../../../build.lite.${os}.${abi}.${lang}
lib_path=$build_dir/lite/tests/benchmark
lib_files=$lib_path/get*latency
adb shell mkdir ${exe_dir}
for file in ${lib_files}
do
adb push ${file} ${exe_dir}
done
}
main $@
python get_latency_lookup_table.py --arm_v7_v8 ${abi}
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import re
import argparse
import subprocess
def get_args():
"""Get arguments.
Returns:
Namespace, arguments.
"""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--ops_path', default='ops.txt', help='Input ops path.')
parser.add_argument(
'--latency_lookup_table_path',
default='latency_lookup_table.txt',
help='Output ops latency path.')
parser.add_argument(
'--platform', default='android', help='Platform: android/ios/custom.')
parser.add_argument('--threads', type=int, default=1, help='Threads.')
parser.add_argument('--power_mode', type=int, default=0, help='PowerMode.')
parser.add_argument('--warmup_times', type=int, default=5,
help='Warm up times of op when estimating latency.')
parser.add_argument('--repeats_times', type=int, default=100,
help='Running times of op when estimating latency.')
parser.add_argument('--arm_v7_v8', type=str, default='armv8',
help='Indicate arm architecture v7 or v8.')
args = parser.parse_args()
return args
def check_dev_connect():
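    """Exit if no Android device is attached via adb."""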
cmd = 'adb devices | grep device'
dev_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = dev_info.communicate()[0]
res = out.decode().find("\tdevice")
if res == -1:
print("No android device is attached")
sys.exit()
def get_dev_info():
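    """Get device hardware name, core count and per-core CPU architecture.
    Returns:
        tuple, (dev_info, core_num, arch_type); arch_type maps each core's
        "CPU part" id to a Cortex architecture name.
    """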
cmd = 'adb shell "cat /proc/cpuinfo | grep Hardware"'
dev_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = dev_info.communicate()[0]
out = out.decode().strip('\n')
dev_info = out.strip('Hardware\t:').strip()
cmd = 'adb shell "cat /proc/cpuinfo | grep part"'
cpu_info = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out = cpu_info.communicate()[0]
out = (out.decode().strip('\n').split('\n'))
core_num = len(out)
arch_type = ['UNKNOWN CPU ARCH']*core_num
for i, v in enumerate(out):
out = v.strip('CPU part').strip().strip(':').strip()
if out == '0xd03':
arch_type[i] = 'ARM_A53'
elif out == '0xd05':
arch_type[i] = 'ARM_A55'
elif out == '0xd07':
arch_type[i] = 'ARM_A57'
elif out == '0xd08':
arch_type[i] = 'ARM_A72'
elif out == '0xd09':
arch_type[i] = 'ARM_A73'
elif out == '0xd0a':
arch_type[i] = 'ARM_A75'
elif out == '0xd40':
arch_type[i] = 'ARM_A76'
elif out == '0x804':
# 855
arch_type[i] = 'ARM_A76'
elif out == '0x805':
# 855
arch_type[i] = 'ARM_A55'
elif out == '0x802':
# 845
arch_type[i] = 'ARM_A75'
elif out == '0x803':
# 845
arch_type[i] = 'ARM_A55'
elif out == '0x801':
# 835
arch_type[i] = 'ARM_A73'
elif out == '0x800':
# 835
arch_type[i] = 'ARM_A73'
elif out == '0x205':
# 820
arch_type[i] = 'ARM_A72'
else:
arch_type[i] = 'UNKNOWN CPU ARCH'
return dev_info, core_num, arch_type
def get_op_latency(op, platform):
"""Get model latency.
Args:
op: list, a list of str represents the op and its parameters.
platform: str, platform name.
Returns:
float, op latency.
"""
if platform == 'android':
commands = 'adb shell "cd /data/local/tmp/bin && ./get_{}_latency {}"'.format(
op[0], ' '.join(op[1:]))
proc = subprocess.Popen(
commands,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
out = proc.communicate()[0]
avg_out = [_ for _ in out.decode().split('\n') if 'Avg Latency' in _][-1]
avg_out = re.findall(r'\d+\.?\d*', avg_out)[0]
avg_out = float(avg_out)
min_out = [_ for _ in out.decode().split('\n') if 'Min Latency' in _][-1]
min_out = re.findall(r'\d+\.?\d*', min_out)[0]
min_out = float(min_out)
max_out = [_ for _ in out.decode().split('\n') if 'Max Latency' in _][-1]
max_out = re.findall(r'\d+\.?\d*', max_out)[0]
max_out = float(max_out)
elif platform == 'ios':
print('ios platform is not supported now')
sys.exit()
else:
print('Please define `get_op_latency` for {} platform'.format(platform))
sys.exit()
return avg_out, min_out, max_out
def main():
args = get_args()
check_dev_connect()
conv_param_dict = {'ch_out': '1', 'stride':'[1 1]', 'pad':'[0 0 0 0]', 'kernel':'3x3',
'group':'1', 'dilation':'[1 1]', 'flag_bias':'1',
'flag_act':'0', 'dtype':'float'}
batchnorm_param_dict = {'epsilon':'1e-4f', 'momentum':'0.9f',
'dtype':'float'}
pooling_param_dict = {'stride':'2', 'pad':'0', 'kernel':'2x2', 'ceil_mode':'0',
'flag_global':'0', 'exclusive':'1', 'pooling_type': 'max',
'dtype':'float'}
activation_param_dict = {'act_type':'relu', 'dtype':'float'}
fc_param_dict = {'param_dim':'1x1','flag_bias':'1', 'dtype':'float'}
op_info = {}
cur_op_name = ''
cur_param_dict = {}
input_dims = ''
output_dims = ''
runtime_cmd = []
fid = open(args.ops_path, 'r')
handle = open(args.latency_lookup_table_path, 'w')
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('dev_info'.ljust(30), 'armv7/v8'.ljust(10), 'core_num'.ljust(10), 'thread_num'.ljust(10), 'power_mode'.ljust(10), 'core0 arch'.ljust(10), 'core1 arch'.ljust(10),
'core2 arch'.ljust(10), 'core3 arch'.ljust(10), 'core4 arch'.ljust(10), 'core5 arch'.ljust(10),
'core6 arch'.ljust(10), 'core7 arch'.ljust(10)))
dev_info, core_num, arch_type = get_dev_info()
    handle.write('{}\t{}\t{}\t{}\t{}'.format(dev_info.ljust(30), str(args.arm_v7_v8).ljust(10), str(core_num).ljust(10), str(args.threads).ljust(10), str(args.power_mode).ljust(10)))
for i in arch_type:
handle.write('\t{}'.format(i).ljust(10))
handle.write('\n')
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('op_name'.ljust(10), 'input_dims'.ljust(10), 'output_dims'.ljust(10), 'param_info'.ljust(80), 'min_latency(ms)'.ljust(10), 'max_latency(ms)'.ljust(10), 'avg_latency(ms)'.ljust(10)))
for line in fid.readlines():
line = [line.strip('\n')]
for data_item in line:
data_item = data_item.strip().split('\t')
cur_op_name = data_item[0]
input_dims = data_item[1]
parameters = data_item[2].strip('( )').split(',')
for item_ in parameters:
item_ = item_.strip().split('=')
# conv op dict
if cur_op_name == 'conv':
cur_param_dict = conv_param_dict
if item_[0] == 'ch_out':
cur_param_dict['ch_out'] = item_[1]
elif item_[0] == 'stride':
cur_param_dict['stride'] = item_[1]
elif item_[0] == 'pad':
cur_param_dict['pad'] = item_[1]
elif item_[0] == 'kernel':
cur_param_dict['kernel'] = item_[1]
elif item_[0] == 'group':
cur_param_dict['group'] = item_[1]
elif item_[0] == 'dilation':
cur_param_dict['dilation'] = item_[1]
elif item_[0] == 'flag_bias':
cur_param_dict['flag_bias'] = item_[1]
elif item_[0] == 'flag_act':
cur_param_dict['flag_act'] = item_[1]
elif item_[0] == 'dtype':
cur_param_dict['dtype'] = item_[1]
#batchnorm op dict
elif cur_op_name == 'batchnorm':
cur_param_dict = batchnorm_param_dict
if item_[0] == 'epsilon':
cur_param_dict['epsilon'] = item_[1]
elif item_[0] == 'momentum':
cur_param_dict['momentum'] = item_[1]
#pooling op dict
elif cur_op_name == 'pooling':
cur_param_dict = pooling_param_dict
if item_[0] == 'stride':
cur_param_dict['stride'] = item_[1]
elif item_[0] == 'pad':
cur_param_dict['pad'] = item_[1]
elif item_[0] == 'kernel':
cur_param_dict['kernel'] = item_[1]
elif item_[0] == 'ceil_mode':
cur_param_dict['ceil_mode'] = item_[1]
elif item_[0] == 'flag_global':
cur_param_dict['flag_global'] = item_[1]
elif item_[0] == 'exclusive':
cur_param_dict['exclusive'] = item_[1]
elif item_[0] == 'pooling_type':
cur_param_dict['pooling_type'] = item_[1]
#activation op dict
elif cur_op_name == 'activation':
cur_param_dict = activation_param_dict
if item_[0] == 'act_type':
cur_param_dict['act_type'] = item_[1]
# fc op dict
elif cur_op_name == 'fc':
cur_param_dict = fc_param_dict
if item_[0] == 'param_dim':
cur_param_dict['param_dim'] = item_[1]
elif item_[0] == 'flag_bias':
cur_param_dict['flag_bias'] = item_[1]
elif item_[0] == 'dtype':
                        cur_param_dict['dtype'] = item_[1]
op_info[cur_op_name] = cur_param_dict
if cur_op_name == 'conv':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
out_ch = cur_param_dict['ch_out']
pad_top = cur_param_dict['pad'].strip('[' ']').split()[0]
pad_bottom = cur_param_dict['pad'].strip('[' ']').split()[1]
pad_left = cur_param_dict['pad'].strip('[' ']').split()[2]
                pad_right = cur_param_dict['pad'].strip('[' ']').split()[3]
dila_h = cur_param_dict['dilation'].strip('[' ']').split()[0]
dila_w = cur_param_dict['dilation'].strip('[' ']').split()[1]
kernel_h = cur_param_dict['kernel'][0]
kernel_w = cur_param_dict['kernel'][2]
stride_h = cur_param_dict['stride'].strip('[' ']').split()[0]
stride_w = cur_param_dict['stride'].strip('[' ']').split()[1]
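                # Output spatial size mirrors the conv kernel's compute_out_dim:
                # out = (in + pad0 + pad1 - (dilation * (kernel - 1) + 1)) // stride + 1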
                hout = (int(height) + int(pad_top) + int(pad_bottom) - int(dila_h) *
                        (int(kernel_h) - 1) - 1) / int(stride_h) + 1
                wout = (int(width) + int(pad_left) + int(pad_right) - int(dila_w) *
                        (int(kernel_w) - 1) - 1) / int(stride_w) + 1
output_dims = '[' + str(batch) + ' ' + str(out_ch) + ' ' + str(int(hout)) + ' ' + str(int(wout)) + ']'
dtype = 0
if cur_param_dict['dtype'] == 'float':
dtype = 0
elif cur_param_dict['dtype'] == 'int8_float':
dtype = 1
elif cur_param_dict['dtype'] == 'int8_int8':
dtype = 2
runtime_cmd = [str(batch), str(in_ch), str(height), str(width), str(out_ch),
str(cur_param_dict['group']), str(cur_param_dict['kernel'])[0],
str(pad_top), str(pad_bottom),
str(pad_left), str(pad_right),
str(stride_h), str(stride_w),
str(dila_h), str(dila_w),
str(cur_param_dict['flag_bias']), str(cur_param_dict['flag_act']),
str(dtype)]
elif cur_op_name == 'batchnorm':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
output_dims = input_dims
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(cur_param_dict['epsilon']), str(cur_param_dict['momentum'])]
elif cur_op_name == 'pooling':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
hout = 1
wout = 1
pad_top = cur_param_dict['pad'].strip('[' ']').split()[0]
pad_bottom = cur_param_dict['pad'].strip('[' ']').split()[1]
pad_left = cur_param_dict['pad'].strip('[' ']').split()[2]
pad_right = cur_param_dict['pad'].strip('[' ']').split()[3]
kernel_h = cur_param_dict['kernel'][0]
kernel_w = cur_param_dict['kernel'][2]
stride_h = cur_param_dict['stride'].strip('[' ']').split()[0]
stride_w = cur_param_dict['stride'].strip('[' ']').split()[1]
if cur_param_dict['flag_global'] == '0':
if cur_param_dict['ceil_mode'] == '0':
hout = (int(height) - int(kernel_h) + int(pad_top) + int(pad_bottom)) / int(stride_h) + 1
wout = (int(width) - int(kernel_w) + int(pad_left) + int(pad_right)) / int(stride_w) + 1
else:
hout = (int(height) - int(kernel_h) + int(pad_top) + int(pad_bottom) + int(stride_h) - 1) / int(stride_h) + 1
wout = (int(width) - int(kernel_w) + int(pad_left) + int(pad_right) + int(stride_w) - 1) / int(stride_w) + 1
output_dims = '[' + batch + ' ' + str(in_ch) + ' ' + str(int(hout)) + ' ' + str(int(wout)) + ']'
pooling_type = 0
if cur_param_dict['pooling_type'] == 'max':
pooling_type = 0
else:
pooling_type = 1
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(stride_h), str(stride_w),
str(pad_top), str(pad_bottom),
str(pad_left), str(pad_right),
str(cur_param_dict['kernel'])[0], str(cur_param_dict['ceil_mode']),
str(cur_param_dict['flag_global']), str(cur_param_dict['exclusive']),
str(pooling_type)]
elif cur_op_name == 'activation':
batch = input_dims.strip('[' ']').split()[0]
in_ch = input_dims.strip('[' ']').split()[1]
height = input_dims.strip('[' ']').split()[2]
width = input_dims.strip('[' ']').split()[3]
act_type = 1
if cur_param_dict['act_type'] == 'relu':
act_type = 1
elif cur_param_dict['act_type'] == 'relu6':
act_type = 2
elif cur_param_dict['act_type'] == 'leaky_relu':
act_type = 4
elif cur_param_dict['act_type'] == 'sigmoid':
act_type = 5
elif cur_param_dict['act_type'] == 'tanh':
act_type = 6
elif cur_param_dict['act_type'] == 'swish':
act_type = 7
elif cur_param_dict['act_type'] == 'exp':
act_type = 8
elif cur_param_dict['act_type'] == 'abs':
act_type = 9
elif cur_param_dict['act_type'] == 'hard_swish':
act_type = 10
elif cur_param_dict['act_type'] == 'reciprocal':
act_type = 11
elif cur_param_dict['act_type'] == 'threshold_relu':
act_type = 12
output_dims = input_dims
runtime_cmd = [str(batch), str(in_ch), str(height), str(width),
str(act_type)]
elif cur_op_name == 'fc':
m = input_dims.strip('[' ']').split()[0]
k = input_dims.strip('[' ']').split()[1]
n = cur_param_dict['param_dim'].split('x')[1]
output_dims = '[' + m + ' ' + n + ']'
runtime_cmd = [str(m), str(n), str(k), str(cur_param_dict['flag_bias']),
str(cur_param_dict['dtype'])]
avg_latency, min_latency, max_latency = get_op_latency([cur_op_name] +
runtime_cmd + [str(args.threads), str(args.power_mode),
str(args.warmup_times), str(args.repeats_times)],
args.platform)
param_dict = ''
for k in cur_param_dict:
param_dict += str(k) + '=' + str(cur_param_dict[k]) + ','
param_dict = '(' + param_dict[:-1] + ')'
handle.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cur_op_name.ljust(10), input_dims.ljust(10), output_dims.ljust(10), param_dict.ljust(80), str(min_latency).ljust(10), str(max_latency).ljust(10), str(avg_latency).ljust(10)))
fid.close()
handle.close()
    print('Congratulations! The latency lookup table has been generated.')
if __name__ == '__main__':
main()
dev_info armv7/v8 core_num thread_num power_mode core0 arch core1 arch core2 arch core3 arch core4 arch core5 arch core6 arch core7 arch
Hisilicon Kirin980 armv8 8 1 ARM_A55 ARM_A55 ARM_A55 ARM_A55 ARM_A76 ARM_A76 ARM_A76 ARM_A76
op_name input_dims output_dims param_info min_latency(ms) max_latency(ms) avg_latency(ms)
conv [1 96 112 112] [1 48 114 114] (ch_out=48,stride=[1 1],pad=[0 0 0 0],kernel=1x1,group=1,dilation=[1 1],flag_bias=0,flag_act=0,dtype=float) 3.472 5.384 3.97393
fc [4 8] [4 1000] (param_dim=8x1000,flag_bias=1,dtype=float) 0.009 0.023 0.00951
batchnorm [1 8 64 64] [1 8 64 64] (epsilon=1e-4f,momentum=0.9f,dtype=float) 0.01 0.012 0.0114
pooling [1 8 64 64] [1 8 32 32] (stride=[2 2],pad=[0 0 0 0],kernel=2x2,ceil_mode=0,flag_global=0,exclusive=0,pooling_type=avg,dtype=float) 0.009 0.01 0.00969
activation [1 8 64 64] [1 8 64 64] (act_type=relu,dtype=float) 0.01 0.028 0.01098
conv [1 96 112 112] (ch_out=48, stride=[1 1], group=1, kernel=1x1, pad=[0 0 0 0], dilation=[1 1], flag_bias=0, flag_act=0, dtype=float)
fc [4 8] (flag_bias=1, param_dim=8x1000)
batchnorm [1 8 64 64] (epsilon=1e-4f, momentum=0.9f)
pooling [1 8 64 64] (stride=[2 2], kernel=2x2, pad=[0 0 0 0], exclusive=0, pooling_type=avg)
activation [1 8 64 64] (act_type=relu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include <memory>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/activation_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::ActivationParam ActivationParam;
using paddle::lite::profile::Timer;
int main(int argc, char** argv) {
if (argc != 10) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <act_type>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>" << std::endl;
return 0;
}
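  // Example invocation (illustrative values; the binary name
  // get_activation_latency follows the get_<op>_latency pattern used by the
  // Python driver script):
  //   ./get_activation_latency 1 8 64 64 1 1 0 10 30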
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int thread_num = atoi(argv[6]);
int power_mode = atoi(argv[7]);
int warmup = atoi(argv[8]);
int repeats = atoi(argv[9]);
int act_type = atoi(argv[5]);
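  // act_type follows paddle::lite_api::ActivationType numbering, matching the
  // dispatch below: 1-relu, 2-relu6, 4-leaky_relu, 5-sigmoid, 6-tanh, 7-swish,
  // 8-exp, 9-abs, 10-hard_swish, 11-reciprocal, 12-thresholded_relu.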
const float six = 6.f;
const float leakey_relu_scale = 8.88f;
#ifdef LITE_WITH_ARM
ActivationParam act_param;
Tensor x, y;
DDim dim_in = DDim({batch_size, input_channel, input_height, input_width});
x.set_precision(PRECISION(kFloat));
x.Resize(dim_in);
paddle::lite::fill_tensor_rand(x, -1.f, 1.f);
act_param.X = &x;
act_param.active_type = (paddle::lite_api::ActivationType)act_type;
act_param.has_active = true;
if (act_type == 2) {
act_param.Relu_clipped_coef = six;
} else if (act_type == 4) {
act_param.Leaky_relu_alpha = leakey_relu_scale;
}
act_param.Out = &y;
act_param.Out->set_precision(PRECISION(kFloat));
act_param.Out->Resize(dim_in);
Timer t0;
if (act_type == 1) {
paddle::lite::kernels::arm::ReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 2) {
paddle::lite::kernels::arm::Relu6Compute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 4) {
paddle::lite::kernels::arm::LeakyReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 5) {
paddle::lite::kernels::arm::SigmoidCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 6) {
paddle::lite::kernels::arm::TanhCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 7) {
paddle::lite::kernels::arm::SwishCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 8) {
paddle::lite::kernels::arm::ExpCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 9) {
paddle::lite::kernels::arm::AbsCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 10) {
paddle::lite::kernels::arm::HardSwishCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 11) {
paddle::lite::kernels::arm::ReciprocalCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
} else if (act_type == 12) {
paddle::lite::kernels::arm::ThresholdedReluCompute act_compute;
act_compute.SetParam(act_param);
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
act_compute.SetContext(std::move(ctx1));
act_compute.PrepareForRun();
// warm up
for (int i = 0; i < warmup; ++i) {
act_compute.Launch();
}
// compute
for (int i = 0; i < repeats; ++i) {
t0.Start();
act_compute.Launch();
t0.Stop();
}
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/batch_norm_compute.h"
#include "lite/operators/op_params.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::kernels::arm::BatchNormCompute BatchNormCompute;
using paddle::lite::profile::Timer;
int main(int argc, char** argv) {
if (argc != 11) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <epsilon>\n"
<< " <momentum>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
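  // Example invocation (illustrative values; the binary name
  // get_batchnorm_latency follows the get_<op>_latency pattern used by the
  // Python driver script):
  //   ./get_batchnorm_latency 1 8 64 64 1e-4 0.9 1 0 10 30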
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
float epsilon = atof(argv[5]);
float momentum = atof(argv[6]);
int thread_num = atoi(argv[7]);
int power_mode = atoi(argv[8]);
int warmup = atoi(argv[9]);
int repeats = atoi(argv[10]);
#ifdef LITE_WITH_ARM
Tensor x;
Tensor scale;
Tensor bias;
Tensor mean;
Tensor variance;
Tensor y;
Tensor mean_out;
Tensor variance_out;
Tensor saved_mean;
Tensor saved_variance;
std::vector<int64_t> in_out_shape = {
batch_size, input_channel, input_height, input_width};
x.Resize(in_out_shape);
scale.Resize({input_channel});
bias.Resize({input_channel});
mean.Resize({input_channel});
variance.Resize({input_channel});
y.Resize(in_out_shape);
mean_out.Resize({input_channel});
variance_out.Resize({input_channel});
saved_mean.Resize({input_channel});
saved_variance.Resize({input_channel});
// initialize the data of input tensors
auto* x_data = x.mutable_data<float>();
auto* scale_data = scale.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* mean_data = mean.mutable_data<float>();
auto* variance_data = variance.mutable_data<float>();
for (int i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i % 64);
}
for (int i = 0; i < scale.dims().production(); i++) {
scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
}
for (int i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
}
for (int i = 0; i < mean.dims().production(); i++) {
mean_data[i] = static_cast<float>(i) * 0.0565f;
}
for (int i = 0; i < variance.dims().production(); i++) {
variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
}
// prepare kernel params and run
BatchNormCompute batch_norm;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
batch_norm.SetContext(std::move(ctx1));
paddle::lite::operators::BatchNormParam param;
param.x = &x;
param.scale = &scale;
param.bias = &bias;
param.mean = &mean;
param.variance = &variance;
param.is_test = false;
param.use_global_stats = true;
param.epsilon = epsilon;
param.momentum = momentum;
param.data_layout = DATALAYOUT(kNCHW);
param.y = &y;
param.mean_out = &mean_out;
param.variance_out = &variance_out;
param.saved_mean = &saved_mean;
param.saved_variance = &saved_variance;
batch_norm.SetParam(param);
// warm up
for (int i = 0; i < warmup; ++i) {
batch_norm.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
batch_norm.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/conv_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::operators::ConvParam ConvParam;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::ActivationParam ActivationParam;
using paddle::lite::profile::Timer;
using paddle::lite_api::PrecisionType;
DDim compute_out_dim(const DDim& dim_in,
const paddle::lite::operators::ConvParam& param) {
DDim dim_out = dim_in;
auto paddings = *param.paddings;
auto dilations = *param.dilations;
dim_out[1] = param.filter->dims()[0];
auto kernel_h = param.filter->dims()[2];
auto kernel_w = param.filter->dims()[3];
auto h = dim_in[2];
auto w = dim_in[3];
int dila_h = dilations[0];
int dila_w = dilations[1];
int pad_top = paddings[0];
int pad_bottom = paddings[1];
int pad_left = paddings[2];
int pad_right = paddings[3];
int stride_h = param.strides[0];
int stride_w = param.strides[1];
auto kernel_exten = dila_h * (kernel_h - 1) + 1;
auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1;
kernel_exten = dila_w * (kernel_w - 1) + 1;
auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1;
dim_out[2] = hout;
dim_out[3] = wout;
return dim_out;
}
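// For example, with h = 112, kernel_h = 3, pad_top = pad_bottom = 1,
// dilation_h = 1 and stride_h = 2: kernel_exten = 3 and
// hout = (112 + 2 - 3) / 2 + 1 = 56 (integer division).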
template <PrecisionType Ptype, PrecisionType OutType>
void test_conv(const DDim& input_dims,
const DDim& weight_dims,
const int group,
const std::vector<int>& strides,
const std::vector<int>& pads,
const std::vector<int>& dilas,
const bool flag_bias,
const int flag_act,
const int thread_num,
const int power_mode,
const int warmup,
const int repeats,
const float leakey_relu_scale = 8.88f) {
ConvParam param;
Tensor x, f, y;
Tensor bias;
param.x = &x;
param.x->set_precision(Ptype);
param.filter = &f;
param.filter->Resize(weight_dims);
param.filter->set_precision(Ptype);
if (flag_bias) {
param.bias = &bias;
param.bias->Resize({weight_dims[0]});
param.bias->set_precision(PRECISION(kFloat));
}
param.strides = strides;
param.paddings = std::make_shared<std::vector<int>>(pads);
param.dilations = std::make_shared<std::vector<int>>(dilas);
param.groups = group;
const float six = 6.f;
if (Ptype == PRECISION(kInt8)) {
std::vector<float> scale_in{1.f / 127};
std::vector<float> scale_out(1, weight_dims.count(1, 4) / 127.f);
if (flag_act == 2) {
scale_out[0] = six / 127.f;
} else if (flag_act == 4) {
if (std::abs(leakey_relu_scale) > 1) {
scale_out[0] *= std::abs(leakey_relu_scale);
}
}
std::vector<float> scale_w(weight_dims[0], 1.f / 127);
param.input_scale = scale_in[0];
param.output_scale = scale_out[0];
param.weight_scale = scale_w;
}
if (flag_act > 0) {
ActivationParam act_param;
act_param.has_active = true;
act_param.active_type = (paddle::lite_api::ActivationType)
flag_act; // 1-relu, 2-relu6, 4-leakyrelu
if (flag_act == 1) {
param.fuse_relu = true;
} else if (flag_act == 2) {
act_param.Relu_clipped_coef = six;
} else if (flag_act == 4) {
act_param.Leaky_relu_alpha = leakey_relu_scale;
}
param.activation_param = act_param;
}
param.output = &y;
param.output->set_precision(OutType);
paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f);
if (flag_bias) {
paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
}
paddle::lite::kernels::arm::ConvCompute<Ptype, OutType> conv;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
param.x->Resize(input_dims);
DDim dim_out = compute_out_dim(input_dims, param);
param.output->Resize(dim_out);
conv.SetParam(param);
conv.SetContext(std::move(ctx1));
conv.PrepareForRun();
paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
// warm up
for (int i = 0; i < warmup; ++i) {
conv.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
conv.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
}
int main(int argc, char** argv) {
if (argc != 23) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <output_channel>\n"
<< " <group_size>\n"
<< " <kernel_size>\n"
<< " <pad_top>\n"
<< " <pad_bottom>\n"
<< " <pad_left>\n"
<< " <pad_right>\n"
<< " <stride_h>\n"
<< " <stride_w>\n"
<< " <dilation_h>\n"
<< " <dilation_w>\n"
<< " <flag_bias>\n"
<< " <flag_act>\n"
<< " <dtype>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
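  // Example invocation (illustrative values; the binary name get_conv_latency
  // follows the get_<op>_latency pattern used by the Python driver script):
  // a 1x1 fp32 convolution, 96 -> 48 channels on a 112x112 input, one thread:
  //   ./get_conv_latency 1 96 112 112 48 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 10 30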
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int output_channel = atoi(argv[5]);
int group_size = atoi(argv[6]);
int kernel_size = atoi(argv[7]);
int pad_top = atoi(argv[8]);
int pad_bottom = atoi(argv[9]);
int pad_left = atoi(argv[10]);
int pad_right = atoi(argv[11]);
int stride_h = atoi(argv[12]);
int stride_w = atoi(argv[13]);
int dilation_h = atoi(argv[14]);
int dilation_w = atoi(argv[15]);
int flag_bias = atoi(argv[16]);
int flag_act = atoi(argv[17]);
int dtype = atoi(argv[18]);
int thread_num = atoi(argv[19]);
int power_mode = atoi(argv[20]);
int warmup = atoi(argv[21]);
int repeats = atoi(argv[22]);
DDim weight_dims(
{output_channel, input_channel / group_size, kernel_size, kernel_size});
DDim input_dims({batch_size, input_channel, input_height, input_width});
switch (dtype) {
case 0:
test_conv<PRECISION(kFloat), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
case 1:
test_conv<PRECISION(kInt8), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
case 2:
test_conv<PRECISION(kInt8), PRECISION(kInt8)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
break;
default:
test_conv<PRECISION(kFloat), PRECISION(kFloat)>(
input_dims,
weight_dims,
group_size,
{stride_h, stride_w},
{pad_top, pad_bottom, pad_left, pad_right},
{dilation_h, dilation_w},
flag_bias,
flag_act,
thread_num,
power_mode,
warmup,
repeats);
}
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include <string>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/fc_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::FcParam FcParam;
using paddle::lite::profile::Timer;
using paddle::lite_api::PrecisionType;
template <PrecisionType Ptype, PrecisionType OutType>
void test_fc(const int m,
const int n,
const int k,
const bool has_bias,
const int thread_num,
const int power_mode,
const int warmup,
const int repeats) {
FcParam param;
Tensor x, y, bias, w;
param.input = &x;
param.input->set_precision(Ptype);
param.input->Resize({m, k});
param.w = &w;
param.w->set_precision(Ptype);
param.w->Resize({k, n});
if (has_bias) {
param.bias = &bias;
param.bias->set_precision(Ptype);
param.bias->Resize({1, n});
} else {
param.bias = nullptr;
}
param.output = &y;
param.output->set_precision(OutType);
param.output->Resize({m, n});
param.in_num_col_dims = 1;
param.in_mat_dims = param.input->dims();
paddle::lite::kernels::arm::FcCompute<Ptype, OutType> fc_compute;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
// set param and context
fc_compute.SetParam(param);
fc_compute.SetContext(std::move(ctx1));
// prepare for run
fc_compute.PrepareForRun();
paddle::lite::fill_tensor_rand(*param.input, -1.f, 1.f);
paddle::lite::fill_tensor_rand(*param.w, -1.f, 1.f);
if (has_bias) {
paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
}
// warm up
for (int i = 0; i < warmup; ++i) {
fc_compute.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
fc_compute.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
}
int main(int argc, char** argv) {
if (argc != 10) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <m>\n"
<< " <n>\n"
<< " <k>\n"
<< " <has_bias>\n"
<< " <dtype>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
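  // Example invocation (illustrative values; the binary name get_fc_latency
  // follows the get_<op>_latency pattern used by the Python driver script):
  // m=4, n=1000, k=8, with bias, fp32, one thread:
  //   ./get_fc_latency 4 1000 8 1 float 1 0 10 30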
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
bool has_bias = atoi(argv[4]) == 0 ? false : true;
  std::string dtype_str(argv[5]);
  int dtype = dtype_str == "int8_int8" ? 2 : dtype_str == "float_int8" ? 1 : 0;
int thread_num = atoi(argv[6]);
int power_mode = atoi(argv[7]);
int warmup = atoi(argv[8]);
int repeats = atoi(argv[9]);
switch (dtype) {
case 0:
test_fc<PRECISION(kFloat), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
case 1:
test_fc<PRECISION(kInt8), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
case 2:
test_fc<PRECISION(kInt8), PRECISION(kInt8)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
default:
test_fc<PRECISION(kFloat), PRECISION(kFloat)>(
m, n, k, has_bias, thread_num, power_mode, warmup, repeats);
break;
}
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <iostream>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
#include "lite/kernels/arm/pool_compute.h"
#include "lite/operators/op_params.h"
#include "lite/tests/utils/tensor_utils.h"
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite::DDim DDim;
typedef paddle::lite::operators::PoolParam PoolParam;
using paddle::lite::profile::Timer;
DDim compute_out_dim(const DDim& dim_in,
const paddle::lite::operators::PoolParam& param) {
DDim dim_out = dim_in;
auto kernel_h = param.ksize[0];
auto kernel_w = param.ksize[1];
auto h = dim_in[2];
auto w = dim_in[3];
auto paddings = *param.paddings;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
bool ceil_mode = param.ceil_mode;
bool flag_global = param.global_pooling;
int hout = 1;
int wout = 1;
if (!flag_global) {
if (!ceil_mode) {
hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1;
wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1;
} else {
hout =
(h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h +
1;
wout =
(w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w +
1;
}
}
dim_out[2] = hout;
dim_out[3] = wout;
return dim_out;
}
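// For example, 2x2 max pooling with stride 2, no padding and ceil_mode = 0 on
// a 64x64 input gives hout = wout = (64 - 2 + 0) / 2 + 1 = 32, matching the
// sample lookup-table entry above.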
int main(int argc, char** argv) {
if (argc != 20) {
std::cerr << "usage: " << argv[0] << "\n"
<< " <batch_size>\n"
<< " <input_channel>\n"
<< " <input_height>\n"
<< " <input_width>\n"
<< " <kernel_size>\n"
<< " <stride_size>\n"
<< " <pad_size>\n"
<< " <exclusive>\n"
<< " <pooling_type>\n"
<< " <ceil_mode>\n"
<< " <flag_global>\n"
<< " <thread_num>\n"
<< " <power_mode>\n"
<< " <warmup_times>\n"
<< " <repeats_times>\n"
<< std::endl;
return 0;
}
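  // Example invocation (illustrative values; the binary name
  // get_pooling_latency follows the get_<op>_latency pattern used by the
  // Python driver script): 2x2 average pooling, stride 2, no padding:
  //   ./get_pooling_latency 1 8 64 64 2 2 0 0 0 0 2 0 0 0 1 1 0 10 30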
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
int batch_size = atoi(argv[1]);
int input_channel = atoi(argv[2]);
int input_height = atoi(argv[3]);
int input_width = atoi(argv[4]);
int stride_h = atoi(argv[5]);
int stride_w = atoi(argv[6]);
int pad_top = atoi(argv[7]);
int pad_bottom = atoi(argv[8]);
int pad_left = atoi(argv[9]);
int pad_right = atoi(argv[10]);
int kernel_size = atoi(argv[11]);
  bool ceil_mode = atoi(argv[12]) == 0 ? false : true;
  bool flag_global = atoi(argv[13]) == 0 ? false : true;
bool exclusive = atoi(argv[14]) == 0 ? false : true;
std::string pooling_type = atoi(argv[15]) == 0 ? "max" : "avg";
int thread_num = atoi(argv[16]);
int power_mode = atoi(argv[17]);
int warmup = atoi(argv[18]);
int repeats = atoi(argv[19]);
#ifdef LITE_WITH_ARM
PoolParam param;
Tensor x, y;
param.x = &x;
param.x->set_precision(PRECISION(kFloat));
param.ksize = {kernel_size, kernel_size};
param.strides = {stride_h, stride_w};
param.paddings = std::make_shared<std::vector<int>>(
std::vector<int>{pad_top, pad_bottom, pad_left, pad_right});
param.ceil_mode = ceil_mode;
param.global_pooling = flag_global;
param.pooling_type = pooling_type;
param.exclusive = exclusive;
param.adaptive = false;
param.use_quantizer = false;
param.output = &y;
param.output->set_precision(PRECISION(kFloat));
paddle::lite::kernels::arm::PoolCompute pool;
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(power_mode),
thread_num);
// set param and context
pool.SetParam(param);
pool.SetContext(std::move(ctx1));
// prepare for run
pool.PrepareForRun();
DDim dim_in = DDim({batch_size, input_channel, input_height, input_width});
DDim dim_out = compute_out_dim(dim_in, param);
param.x->Resize(dim_in);
param.output->Resize(dim_out);
paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f);
// warm up
for (int i = 0; i < warmup; ++i) {
pool.Launch();
}
// compute
Timer t0;
for (int i = 0; i < repeats; ++i) {
t0.Start();
pool.Launch();
t0.Stop();
}
printf("Avg Latency is %f\n", t0.LapTimes().Avg());
printf("Min Latency is %f\n", t0.LapTimes().Min());
printf("Max Latency is %f\n", t0.LapTimes().Max());
#endif
return 0;
}
...@@ -130,7 +130,6 @@ void TestCast(Place place, float abs_error, int in_dtype, int out_dtype) { ...@@ -130,7 +130,6 @@ void TestCast(Place place, float abs_error, int in_dtype, int out_dtype) {
} }
TEST(Cast, precision) { TEST(Cast, precision) {
LOG(INFO) << "test cast op";
Place place; Place place;
float abs_error = 2e-5; float abs_error = 2e-5;
#if defined(LITE_WITH_ARM) #if defined(LITE_WITH_ARM)
...@@ -150,7 +149,7 @@ TEST(Cast, precision) { ...@@ -150,7 +149,7 @@ TEST(Cast, precision) {
TestCast(place, abs_error, 20, 5); TestCast(place, abs_error, 20, 5);
#endif #endif
TestCast(place, abs_error, 2, 5); TestCast(place, abs_error, 2, 5);
#if defined(LITE_WITH_XPU) || defined(LITE_WITH_HUAWEI_ASCEND_NPU) #if defined(LITE_WITH_HUAWEI_ASCEND_NPU)
TestCast(place, abs_error, 3, 5); TestCast(place, abs_error, 3, 5);
TestCast(place, abs_error, 5, 3); TestCast(place, abs_error, 5, 3);
#endif #endif
......
...@@ -121,9 +121,9 @@ class FcOPTest : public arena::TestCase { ...@@ -121,9 +121,9 @@ class FcOPTest : public arena::TestCase {
int k = wdims_[0]; int k = wdims_[0];
int n = wdims_[1]; int n = wdims_[1];
LOG(INFO) << "M=" << m << ", N=" << n << ", K=" << k VLOG(4) << "M=" << m << ", N=" << n << ", K=" << k << ", bias=" << flag_bias
<< ", bias=" << flag_bias << ", with_relu=" << with_relu_ << ", with_relu=" << with_relu_
<< ", padding_weights=" << padding_weights_; << ", padding_weights=" << padding_weights_;
if (m == 1) { if (m == 1) {
basic_gemv(n, basic_gemv(n,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
template <class T = float, class R = int64_t>
class GatherComputeTest : public arena::TestCase { class GatherComputeTest : public arena::TestCase {
protected: protected:
// common attributes for this op. // common attributes for this op.
...@@ -53,9 +54,9 @@ class GatherComputeTest : public arena::TestCase { ...@@ -53,9 +54,9 @@ class GatherComputeTest : public arena::TestCase {
out_dims[0] = batch_size; out_dims[0] = batch_size;
out->Resize(out_dims); out->Resize(out_dims);
auto x_data = x->data<int64_t>(); auto x_data = x->template data<T>();
auto index_data = index->data<int64_t>(); auto index_data = index->template data<R>();
auto out_data = out->mutable_data<int64_t>(); auto out_data = out->template mutable_data<T>();
auto slice_num = x_dims[0]; auto slice_num = x_dims[0];
auto slice_size = x_dims.Slice(1, x_dims.size()).production(); auto slice_size = x_dims.Slice(1, x_dims.size()).production();
...@@ -66,7 +67,7 @@ class GatherComputeTest : public arena::TestCase { ...@@ -66,7 +67,7 @@ class GatherComputeTest : public arena::TestCase {
CHECK_GE(index, 0) << "gather ids[i] expected >= 0 but got " << index; CHECK_GE(index, 0) << "gather ids[i] expected >= 0 but got " << index;
memcpy(out_data + i * slice_size, memcpy(out_data + i * slice_size,
x_data + index * slice_size, x_data + index * slice_size,
slice_size * sizeof(int64_t)); slice_size * sizeof(T));
} }
} }
...@@ -78,11 +79,12 @@ class GatherComputeTest : public arena::TestCase { ...@@ -78,11 +79,12 @@ class GatherComputeTest : public arena::TestCase {
} }
void PrepareData() override { void PrepareData() override {
std::vector<int64_t> x(x_dims_.production()); std::vector<T> x(x_dims_.production());
fill_data_rand(x.data(), int64_t(-1), int64_t(1), x_dims_.production()); fill_data_rand(
x.data(), static_cast<T>(-1), static_cast<T>(1), x_dims_.production());
std::vector<int64_t> index(index_dims_.production()); std::vector<R> index(index_dims_.production());
fill_data_rand<int64_t>( fill_data_rand<R>(
index.data(), 0, x_dims_[0] - 1, index_dims_.production()); index.data(), 0, x_dims_[0] - 1, index_dims_.production());
SetCommonTensor(x_, x_dims_, x.data()); SetCommonTensor(x_, x_dims_, x.data());
...@@ -90,8 +92,20 @@ class GatherComputeTest : public arena::TestCase { ...@@ -90,8 +92,20 @@ class GatherComputeTest : public arena::TestCase {
} }
}; };
template <class T = float, class R = int64_t>
void TestGather(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& index_dims,
Place place,
float abs_error = 1e-5,
const std::string& alias = "def") {
std::unique_ptr<arena::TestCase> tester(new GatherComputeTest<T, R>(
place, alias, DDim(x_dims), DDim(index_dims)));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
TEST(Gather, precision) { TEST(Gather, precision) {
float abs_error = 2e-5; float abs_error = 1e-5;
Place place; Place place;
#if defined(LITE_WITH_NPU) #if defined(LITE_WITH_NPU)
place = TARGET(kNPU); place = TARGET(kNPU);
...@@ -110,10 +124,14 @@ TEST(Gather, precision) { ...@@ -110,10 +124,14 @@ TEST(Gather, precision) {
for (auto x_dims : for (auto x_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
for (auto index_dims : std::vector<std::vector<int64_t>>{{3}, {7}, {10}}) { for (auto index_dims : std::vector<std::vector<int64_t>>{{3}, {7}, {10}}) {
std::unique_ptr<arena::TestCase> tester(new GatherComputeTest( #if defined(LITE_WITH_XPU) || defined(LITE_WITH_NPU)
place, "int64", DDim(x_dims), DDim(index_dims))); TestGather<float, int>(x_dims, index_dims, place, abs_error, "def");
arena::Arena arena(std::move(tester), place, abs_error); #else
arena.TestPrecision(); TestGather<float, int64_t>(x_dims, index_dims, place, abs_error, "int64");
TestGather<int64_t, int64_t>(
x_dims, index_dims, place, abs_error, "int64");
TestGather<float, int>(x_dims, index_dims, place, abs_error, "int32");
#endif
} }
} }
} }
......
...@@ -738,7 +738,7 @@ TEST(PriorBox, precision) { ...@@ -738,7 +738,7 @@ TEST(PriorBox, precision) {
} }
TEST(DensityPriorBox, precision) { TEST(DensityPriorBox, precision) {
#ifdef LITE_WITH_X86 #if defined(LITE_WITH_X86) && !defined(LITE_WITH_XPU)
Place place(TARGET(kX86)); Place place(TARGET(kX86));
test_density_prior_box(place); test_density_prior_box(place);
#endif #endif
......
...@@ -372,7 +372,7 @@ function make_x86 { ...@@ -372,7 +372,7 @@ function make_x86 {
build_directory=$BUILD_DIR/build.lite.x86 build_directory=$BUILD_DIR/build.lite.x86
if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then
export CXX=/usr/bin/g++ # Ascend need g++ in centos export CXX=g++ # Huawei Ascend NPU need g++
build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu
fi fi
......