未验证 提交 e7d31e1f 编写于 作者: S Santa An 提交者: GitHub

Merge branch 'develop' into baolei/bitmain

...@@ -63,6 +63,16 @@ test/models/ ...@@ -63,6 +63,16 @@ test/models/
test/images/ test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files # Emacs intermediate files
*~ *~
......
...@@ -45,7 +45,7 @@ else() ...@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling # we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
###################################################################################################### ######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` ...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib`
4、 `demo` 文件夹:c++ demo. 4、 `demo` 文件夹:c++ demo.
如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so` 如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`
## 运行 ## 运行
...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg ...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
二: 运行 二: 运行
**NOTE:**此处示例使用的是python接口。 **NOTE:** 此处示例使用的是python接口。
``` python ``` python
#-*- coding: utf-8 -*- #-*- coding: utf-8 -*-
...@@ -75,7 +75,7 @@ import sys ...@@ -75,7 +75,7 @@ import sys
import numpy as np import numpy as np
import cv2 import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib') sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import * from lite import *
def read_img(im_path, resize_h, resize_w): def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32') im = cv2.imread(im_path).astype('float32')
......
...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
) )
add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG)
lite_cc_library(place SRCS paddle_place.cc DEPS logging) lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else() else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog) lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; } ...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames // get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; } std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
// Returns the names of all attribute variables (i.e. the model's
// parameters/weights) registered in the predictor's execution scope.
// NOTE(review): assumes exec_scope_ has been initialized (by Build());
// no null-check is performed here — confirm callers always Build() first.
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() { void Predictor::PrepareFeedFetch() {
if (!program_) { if (!program_) {
...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, ...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back( inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
} }
...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { ...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const { const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name); auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>(); return &var->Get<lite::Tensor>();
} }
// Look up the variable `name` in the execution scope and return it as a
// mutable lite::Tensor (non-owning pointer; the scope retains ownership).
// Aborts via CHECK if no variable with that name exists.
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name // get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) { lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name); auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
...@@ -85,6 +85,9 @@ class LITE_API Predictor { ...@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames. // get inputnames and get outputnames.
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
// Get offset-th col of fetch results. // Get offset-th col of fetch results.
...@@ -92,6 +95,9 @@ class LITE_API Predictor { ...@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const; std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const; const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { ...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names // get inputs names and get outputs names
std::vector<std::string> GetInputNames() override; std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override; std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor( std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override; const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTebsor by name // Get InputTebsor by name
std::unique_ptr<lite_api::Tensor> GetInputByName( std::unique_ptr<lite_api::Tensor> GetInputByName(
......
...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() { ...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames(); return raw_predictor_.GetInputNames();
} }
// Public-API shim: forwards to the underlying Predictor to fetch the
// names of the model's parameter variables.
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() { std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames(); return raw_predictor_.GetOutputNames();
} }
...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor( ...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x)); return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
} }
// Public-API shim: returns a mutable tensor by name, wrapped in the
// lite_api::Tensor facade. The unique_ptr owns only the wrapper;
// NOTE(review): the underlying tensor storage appears to remain owned by
// the predictor's scope — confirm lifetime contract with callers.
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName( std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) { const std::string &name) {
return std::unique_ptr<lite_api::Tensor>( return std::unique_ptr<lite_api::Tensor>(
......
...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); ...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0, DEFINE_string(input_shape_0,
"1,3,224,224", "1,3,224,224",
"input shapes another, separated by colon and comma"); "input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb, DEFINE_bool(use_optimize_nb,
false, false,
"optimized & naive buffer model for mobile devices"); "optimized & naive buffer model for mobile devices");
...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
if (FLAGS_target == "arm") {
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid, int tid,
const int warmup_times = 5) { const int warmup_times = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0); config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config); auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) { for (int i = 0; i < 2 * repeat; i += 2) {
...@@ -246,7 +256,8 @@ int main(int argc, char** argv) { ...@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: " LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model"; << "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0); exit(0);
} }
std::string save_optimized_model_dir = ""; std::string save_optimized_model_dir = "";
......
...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); ...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string( DEFINE_string(
optimize_out_type, optimize_out_type,
"protobuf", "naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer"); "store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info, DEFINE_bool(record_tailoring_info,
...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
std::cout << std::setiosflags(std::ios::internal); std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name"; std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1); std::cout << std::setw(10) << targets[i].substr(1);
} }
std::cout << std::endl; std::cout << std::endl;
...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first; std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second; auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
// Print OP info. // Print OP info.
auto ops_valid_places = supported_ops.at(*op); auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -288,11 +288,11 @@ void ParseInputCommand() { ...@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces(); auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string // get valid_targets string
std::vector<TargetType> target_types = {}; std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target); target_types.push_back(valid_places[i].target);
} }
std::string targets_str = TargetToStr(target_types[0]); std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) { for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]); targets_str = targets_str + TargetToStr(target_types[i]);
} }
...@@ -301,7 +301,7 @@ void ParseInputCommand() { ...@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk)); target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops; std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) { for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])]; auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end()); valid_ops.insert(ops.begin(), ops.end());
} }
...@@ -318,7 +318,7 @@ void CheckIfModelSupported() { ...@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))]; auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert( valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target; auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)]; auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
...@@ -340,7 +340,7 @@ void CheckIfModelSupported() { ...@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops; std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops; std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) { for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index); auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) { for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i); auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
...@@ -364,13 +364,13 @@ void CheckIfModelSupported() { ...@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str; unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
} }
std::vector<TargetType> targets = {}; std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target); targets.push_back(valid_places[i].target);
} }
std::sort(targets.begin(), targets.end()); std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]); std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) { for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]); targets_str = targets_str + "," + TargetToStr(targets[i]);
} }
......
...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'"; "command argument 'valid_targets'";
} }
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { void OptBase::SetLiteOut(const std::string& lite_out_name) {
optimize_out_path_ = optimized_out_path; lite_out_name_ = lite_out_name;
} }
void OptBase::RunOptimize(bool record_strip_info) { void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false); CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_); opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") { if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info); RunOptimizeFromModelSet(record_strip_info_);
} else { } else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name = auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model"; record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully"; << " into :" << lite_out_name_ << "successfully";
} }
} }
// One-shot convenience overload: configures every input (model dir,
// combined model/param files, target places, output path) from a single
// call, then performs the model transformation and saves the optimized
// model. Empty-string arguments leave the corresponding setting at its
// prior/default value via the individual setters.
// NOTE(review): the body after SetLiteOut() duplicates Run() verbatim —
// consider delegating to Run() to keep the two paths from drifting.
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
// Stage all user-provided settings through the standard setters so that
// validation/parsing (e.g. of valid_places) happens in one place.
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
// A non-empty model-set dir means "optimize a directory of models";
// otherwise optimize the single configured model.
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset // collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir, void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models, const std::vector<std::string>& models,
...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { ...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
} }
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set. // 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_); lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true); auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) { if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir = std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/"); lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir = std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/"); lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path = auto model_file_path =
...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. "; std::cout << "Optimize done. ";
} }
...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) { if (record_strip_info) {
// Collect all models information // Collect all models information
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :" std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully"; << lite_out_name_ << "successfully";
} }
} }
void OptBase::PrintHelpInfo() { void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version(); const std::string opt_version = lite::version();
const char help_info[] = const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed " "------------------------------------------------------------------------"
"below:\n" "-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n" " Arguments of help information:\n"
" `help()` Print help infomation\n" " `help()` Print help infomation\n"
" Arguments of model optimization:\n" "\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n" " `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n" " `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
" `set_optimize_out(output_optimize_model_dir)`\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `record_model_info(false|true)`: refer to whether to record ops "
" ` ----fasle&true refer to whether to record ops info for " "info for striping lib, false by default`\n"
"tailoring lib, false by default`\n" " `run() : start model transformation`\n"
" Arguments of model checking and ops information:\n" " eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of " " `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid " " `print_supported_ops` Display supported operators of valid "
"places\n" "places\n"
" `check_if_model_supported()` Check if the input model is " " `check_if_model_supported()` Check if the input model is "
"supported\n"; "supported\n"
"------------------------------------------------------------------------"
std::cout << "opt version:" << opt_version << std::endl "-----------------------------------------------------------\n";
<< help_info << std::endl; std::cout << "opt version:" << opt_version << std::endl << help_info;
} }
// 2. Print supported info of inputed ops // 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
...@@ -44,16 +44,21 @@ class LITE_API OptBase { ...@@ -44,16 +44,21 @@ class LITE_API OptBase {
public: public:
OptBase() = default; OptBase() = default;
void SetModelSetDir(const std::string &model_set_path); void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path); void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path); void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path); void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places); void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path); void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type // set optimized_model type
void SetModelType(std::string model_type); void SetModelType(std::string model_type);
// transform and save the optimized model // transform and save the optimized model
void RunOptimize(bool record_strip_info = false); void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info // fuctions of printing info
// 1. help info // 1. help info
void PrintHelpInfo(); void PrintHelpInfo();
...@@ -71,12 +76,12 @@ class LITE_API OptBase { ...@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model // valid places for the optimized_model
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
// filename of the optimized_model // filename of the optimized_model
std::string optimize_out_path_; std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default. // type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer}; LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model // Dir path of a set of models, this should be combined with model
std::string model_set_dir_; std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false); void RunOptimizeFromModelSet(bool record_strip_info = false);
}; };
......
...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } ...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type, LiteModelType model_type,
bool record_info) { bool record_info) {
......
...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { ...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0; virtual std::vector<std::string> GetInputNames() = 0;
// Get output names // Get output names
virtual std::vector<std::string> GetOutputNames() = 0; virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name // Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0; virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor { ...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists. /// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor( virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0; const std::string& name) const = 0;
/// Get a mutable tensor, return null if on one called `name` exists
/// internal infereces API, not recommanded.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by /// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig. /// CxxConfig, and the persisted model can be reused for MobileConfig.
...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; } bool multi_stream() const { return multi_stream_; }
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread. // current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread. // XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0); void set_xpu_dev_per_thread(int dev_no = 0);
}; };
......
...@@ -19,7 +19,13 @@ ...@@ -19,7 +19,13 @@
#pragma once #pragma once
// some platform-independent defintion // some platform-independent defintion
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \ #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \ extern int touch_op_##op_type__(); \
......
...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); ...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); ...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { ...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile) .def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile) .def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces) .def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut) .def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType) .def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize) .def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo) .def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps) .def("print_supported_ops", &OptBase::PrintSupportedOps)
......
...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': ...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON':
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
if os.name != 'nt': if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so" /inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0: if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND) raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -21,6 +21,17 @@ namespace paddle { ...@@ -21,6 +21,17 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
void pooling_basic(const float* din, void pooling_basic(const float* din,
float* dout, float* dout,
int num, int num,
...@@ -88,15 +99,27 @@ void pooling_basic(const float* din, ...@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for #pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) { for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h; int sh, eh;
int eh = sh + kernel_h; if (adaptive) {
sh = AdaptStartIndex(ind_h, hin, hout);
eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h; eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w; int sw, ew;
int ew = sw + kernel_w; if (adaptive) {
sw = AdaptStartIndex(ind_w, win, wout);
ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w; ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0); float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out + int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w; ind_h * wout + ind_w;
......
此差异已折叠。
...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { ...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T> template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias); void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T> template <typename T>
void scale(const T* din, void scale(const T* din,
T* dout, T* dout,
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,10 +57,11 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,10 +57,11 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -79,22 +73,21 @@ __kernel void leaky_relu(__read_only image2d_t input, ...@@ -79,22 +73,21 @@ __kernel void leaky_relu(__read_only image2d_t input,
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
...@@ -104,16 +97,14 @@ __kernel void tanh_act(__read_only image2d_t input, ...@@ -104,16 +97,14 @@ __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -121,13 +112,11 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -121,13 +112,11 @@ __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -138,16 +127,13 @@ __kernel void swish(__read_only image2d_t input, ...@@ -138,16 +127,13 @@ __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
...@@ -9,7 +10,7 @@ __kernel void conv2d_1x1_opt(__private const int global_size_dim0, ...@@ -9,7 +10,7 @@ __kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -27,33 +27,33 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -27,33 +27,33 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int offset, __private const int offset,
__private const int input_c, __private const int input_c,
__private const int dilation, __private const int dilation,
__private const int input_width,/* of one block */ __private const int input_width, /* of one block */
__private const int input_height,/* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int output_c, __private const int output_c,
__private const int filter_channel, __private const int filter_channel,
__private const int filter_width, __private const int filter_width,
__private const int filter_height, __private const int filter_height,
__private const int group) { __private const int group,
__private const int input_tensor_c
) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) { out_nh >= global_size_dim2) {
return; return;
} }
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -67,80 +67,167 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -67,80 +67,167 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else #else
CL_DTYPE4 output = 0.0f; CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
#endif #endif
CL_DTYPE4 input[9]; // 3x3 region of input CL_DTYPE4 input[9]; // 3x3 region of input
if (group == 1) { if (group == 1) {
for (int i = 0; i < input_c; ++i) { // each run for 3x3 for (int i = 0; i < input_c; ++i) { // each run for 3x3
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)), (int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y - dilation)), (int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)), (int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[3] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y)), (int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[4] = select(
(int2)(pos_in.x, pos_in.y)), READ_IMG_TYPE(
(CL_DTYPE4)(0.0f), CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y)), (int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)), (int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y + dilation)), (int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
if (i == input_c - 1) {
int c_shr = input_tensor_c % 4;
if (c_shr == 1) {
for (int k = 0; k < 9; k++) {
input[k].y = (half)0.f;
input[k].z = (half)0.f;
input[k].w = (half)0.f;
}
} else if (c_shr == 2) {
for (int k = 0; k < 9; k++) {
input[k].z = (half)0.f;
input[k].w = (half)0.f;
}
} else if (c_shr == 3) {
for (int k = 0; k < 9; k++) {
input[k].w = (half)0.f;
}
} else if (c_shr == 0) {
}
}
int j = 0; int j = 0;
int2 pos_of_weight; int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3; pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_x =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_y =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_z =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
pos_of_weight.y += 3; pos_of_weight.y += 3;
CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight_w =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
j = 1; j = 1;
...@@ -297,79 +384,98 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -297,79 +384,98 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y); in_pos_in_one_block.y);
input[0] = select( input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)), (int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[1] = input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y - dilation)), (int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[2] = select( input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)), (int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height) in_pos_in_one_block.y - dilation >= input_height)
<< 15)); << 15));
input[3] = select( input[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y)), (int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[4] = select( input[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), READ_IMG_TYPE(CL_DTYPE_CHAR,
(CL_DTYPE4)(0.0f), input_image,
sampler,
(int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[5] = input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y)), (int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height) in_pos_in_one_block.y >= input_height)
<< 15)); << 15));
input[6] = select( input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)), (int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height) in_pos_in_one_block.y + dilation >= input_height)
<< 15)); << 15));
input[7] = input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, select(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x, pos_in.y + dilation)), (int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || (ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height) in_pos_in_one_block.y + dilation >= input_height)
<< 15)); << 15));
input[8] = select( input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.x + dilation >= input_width ||
...@@ -381,7 +487,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, ...@@ -381,7 +487,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int2 pos_of_weight; int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3; pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); CL_DTYPE4 weight =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4; int f_c_offset = f_c % 4;
CL_DTYPE f_value; CL_DTYPE f_value;
......
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void depth_conv2d_3x3(__private const int global_size_dim0, __kernel void depth_conv2d_3x3(
__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input, __read_only image2d_t input,
...@@ -28,7 +28,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -28,7 +28,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
__private const int offset, __private const int offset,
__private const int dilation, __private const int dilation,
__private const int input_c, __private const int input_c,
__private const int input_width,/* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height) { __private const int output_height) {
...@@ -39,23 +39,22 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -39,23 +39,22 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height; const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height; const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else #else
...@@ -65,30 +64,66 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -65,30 +64,66 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
const int filter_width = 3; const int filter_width = 3;
const int filter_height = 3; const int filter_height = 3;
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x ; int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y ; int filter_y = pos_in_filter_block.y;
CL_DTYPE4 inputs[9]; CL_DTYPE4 inputs[9];
inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), inputs[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[1] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/* /*
if (output_pos.x == 112 && output_pos.y == 0) { if (output_pos.x == 112 && output_pos.y == 0) {
CL_DTYPE4 input1 = inputs[3]; CL_DTYPE4 input1 = inputs[3];
...@@ -98,45 +133,94 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -98,45 +133,94 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
} }
*/ */
inputs[4] = select(
inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), in_pos_in_one_block.y >= input_height)
<< 15));
inputs[5] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f), (CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
CL_DTYPE4 filters[9]; CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); filters[0] =
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); filters[1] = READ_IMG_TYPE(
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); filters[2] = READ_IMG_TYPE(
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); filters[3] = READ_IMG_TYPE(
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); filters[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
for(int i = 0 ;i < 9 ; i++){ filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i]; output += inputs[i] * filters[i];
} }
output = activation_type4(output); output = activation_type4(output);
/* /*
if (output_pos.x == 112 && output_pos.y == 0) { if (output_pos.x == 112 && output_pos.y == 0) {
...@@ -158,11 +242,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, ...@@ -158,11 +242,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
*/ */
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
} }
__kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_w_blk, __private const int ou_w_blk,
__private const int ou_nh, __private const int ou_nh,
...@@ -176,7 +257,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -176,7 +257,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int pad, __private const int pad,
__private const int dilation, __private const int dilation,
__private const int in_ch, __private const int in_ch,
__private const int in_w,/* of one block */ __private const int in_w, /* of one block */
__private const int in_h, /* of one block */ __private const int in_h, /* of one block */
__private const int ou_w, __private const int ou_w,
__private const int ou_h) { __private const int ou_h) {
...@@ -195,19 +276,21 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -195,19 +276,21 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int col_id = ou_col_id - pad; int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad; int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output[2]; CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0]; output[1] = output[0];
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output[2]; CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); output[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
} }
#else #else
CL_DTYPE4 output[2] = {0.0f}; CL_DTYPE4 output[2] = {0.0f};
...@@ -218,9 +301,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -218,9 +301,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int filter_x = ou_ch_blk_id * 3; int filter_x = ou_ch_blk_id * 3;
int filter_y = 0; int filter_y = 0;
CL_DTYPE4 filters[9]; CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); filters[0] =
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); filters[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id); int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id); int in_y = mad24(batch_id, in_h, row_id);
...@@ -244,11 +330,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -244,11 +330,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[2], filters[2], output[0]); output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]); output[1] = mad(inputs[3], filters[2], output[1]);
filters[3] = READ_IMG_TYPE(
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); filters[4] = READ_IMG_TYPE(
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1));
...@@ -256,7 +343,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -256,7 +343,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1));
inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[4], filters[3], output[0]); output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[5], filters[3], output[1]); output[1] = mad(inputs[5], filters[3], output[1]);
...@@ -266,10 +352,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -266,10 +352,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[6], filters[5], output[0]); output[0] = mad(inputs[6], filters[5], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]); output[1] = mad(inputs[7], filters[5], output[1]);
filters[6] = READ_IMG_TYPE(
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); filters[7] = READ_IMG_TYPE(
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2));
...@@ -277,7 +365,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -277,7 +365,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2));
inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]); output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]); output[1] = mad(inputs[9], filters[6], output[1]);
...@@ -290,10 +377,10 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, ...@@ -290,10 +377,10 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = activation_type4(output[0]); output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]); output[1] = activation_type4(output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
} }
} }
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
"out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
"%.2f\n",
out_w, out_w,
out_C - 4 * out_c, out_C - 4 * out_c,
(float)(in[input_pos0]), (float)(in[input_pos0]),
(float)(in[input_pos1]), (float)(in[input_pos1]),
(float)(in[input_pos2]), (float)(in[input_pos2]),
(float)(in[input_pos3])); (float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
output_pos.x, output_pos.y, out_c,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -104,30 +112,38 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -104,30 +112,38 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
...@@ -182,7 +198,6 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,7 +198,6 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,19 +256,18 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,19 +256,18 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
input_data[i * in_dims[0] + j] + vector_data[j];
} }
} }
} }
......
...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
} }
} }
if (has_value_input == nullptr) { if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl; VLOG(3) << "no input has value! just return";
return; return;
} }
auto input_width = has_value_input->value().dims()[1]; auto input_width = has_value_input->value().dims()[1];
......
...@@ -19,6 +19,7 @@ namespace lite { ...@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr}; thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> { ...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) { if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context(); _tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx); CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
} }
return _tls_raw_ctx; return _tls_raw_ctx;
} }
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size); _workspace_l3_size_per_thread = l3_size;
} }
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) { static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV"); const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) { if (dev_env) {
...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> { ...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> {
private: private:
static thread_local xdnn::Context* _tls_raw_ctx; static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
}; };
#endif #endif
...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> { ...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
......
...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes ...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
variable_place_inference_pass.cc variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
// Matches an inference-time "identity" dropout and removes it from the graph.
// With attribute is_test == 1 and dropout_implementation ==
// "upscale_in_train", dropout's Out equals its X at inference time, so the op
// (and its Mask output) can be elided by rewiring the preceding op to write
// directly into dropout's Out variable.
class Eliminator : public FuseBase {
 public:
  void BuildPattern() override {
    // The previous op's output needs updating after the elimination.
    // conditional_block is excluded: its outputs cannot be renamed this way.
    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
    // TODO(Superjomn) check has only one output
    auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
    // Only dropouts that are provably no-ops at inference time are matched.
    auto* dropout_op = OpNode("dropout", "dropout")
                           ->assert_op_attr<int>("is_test", 1)
                           ->assert_op_attr<std::string>(
                               "dropout_implementation", "upscale_in_train");
    auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
    auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");

    // Topology: pre_op -> x -> dropout -> {out, mask}.
    *pre_op >> *x >> *dropout_op >> *out;
    *dropout_op >> *mask;

    // The pre_op will be eliminated, and a new output-updated op will insert.
    x->AsIntermediate();  // x is pre_op's output, need to update
    dropout_op->AsIntermediate();
    mask->AsIntermediate();
  }

 private:
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto& pre_op = matched.at("preop")->AsStmt();
    auto op_info = *pre_op.op_info();

    // Rename pre_op's output "x" to dropout's "out" so downstream consumers
    // of the dropout result now read pre_op's output directly.
    op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
                             matched.at("out")->AsArg().name);
    pre_op.ResetOp(op_info, graph->valid_places());

    // Re-link the graph: pre_op now produces "out".
    IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
  }
};
} // namespace
// Program pass that runs the identity-dropout Eliminator once over the graph.
class IdentityDropoutEliminatePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // A temporary fuser is enough: all state lives in the matched pattern.
    Eliminator()(graph.get());
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate ...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
...@@ -44,6 +47,7 @@ set(mir_fusers ...@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a chain of n_embedding_ lookup_table ops whose outputs are summed by
// a cascade of elementwise_add ops into a single
// __xpu__embedding_with_eltwise_add op. The matched pattern is:
//   lookup_table(ids0, table0) + lookup_table(ids1, table1)  -> ewadd01
//   ewadd01 + lookup_table(ids2, table2)                     -> ewadd12
//   ... and so on, up to n_embedding_ tables.
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
 public:
  // n_embedding: number of lookup_table ops in the chain to match.
  explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
      : n_embedding_(n_embedding) {}

  void BuildPattern() override {
    // First two embeddings plus the first elementwise_add are spelled out
    // explicitly; the remaining (i >= 2) are generated in the loop below.
    auto* ids0 =
        VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table0 =
        VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
    // NOTE: embedding0 is deliberately NOT marked intermediate — it is reused
    // as the host node of the fused op in InsertNewNode().
    auto* embedding0 = OpNode("embedding0", "lookup_table");
    auto* embedding_out0 = VarNode("embedding_out0")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "X")
                               ->AsIntermediate();

    auto* ids1 =
        VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table1 =
        VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
    auto* embedding_out1 = VarNode("embedding_out1")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "Y")
                               ->AsIntermediate();

    auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
    auto* ewadd01_out = VarNode("ewadd01_out")
                            ->assert_is_op_output("elementwise_add", "Out")
                            ->AsIntermediate();

    embedding0->LinksFrom({ids0, table0});
    embedding0->LinksTo({embedding_out0});
    embedding1->LinksFrom({ids1, table1});
    embedding1->LinksTo({embedding_out1});
    ewadd01->LinksFrom({embedding_out0, embedding_out1});
    ewadd01->LinksTo({ewadd01_out});

    // Extend the chain: each further embedding output is added (as "Y") to
    // the running sum produced by the previous elementwise_add.
    auto* last_ewadd_out = ewadd01_out;
    for (int i = 2; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      auto embedding_name = paddle::lite::string_format("embedding%d", i);
      auto embedding_out_name =
          paddle::lite::string_format("embedding_out%d", i);

      auto* new_ids = VarNode(ids_name)
                          ->assert_is_op_input("lookup_table", "Ids")
                          ->AsInput();
      auto* new_table = VarNode(table_name)
                            ->assert_is_op_input("lookup_table", "W")
                            ->AsInput();
      auto* new_embedding =
          OpNode(embedding_name, "lookup_table")->AsIntermediate();
      auto* new_embedding_out = VarNode(embedding_out_name)
                                    ->assert_is_op_output("lookup_table", "Out")
                                    ->assert_is_op_input("elementwise_add", "Y")
                                    ->AsIntermediate();

      new_embedding->LinksFrom({new_ids, new_table});
      new_embedding->LinksTo({new_embedding_out});

      // Adder names follow "ewadd<i-1><i>", e.g. ewadd12, ewadd23, ...
      auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
      auto ewadd_out_name = ewadd_name + "_out";

      auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
      auto* new_ewadd_out = VarNode(ewadd_out_name)
                                ->assert_is_op_output("elementwise_add", "Out")
                                ->AsIntermediate();

      new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
      new_ewadd->LinksTo({new_ewadd_out});
      last_ewadd_out = new_ewadd_out;
    }
    // The final running sum is the pattern's (and the fused op's) output.
    last_ewadd_out->AsOutput();
  }

  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    // Build the fused op's description: all Ids/Tables inputs in chain order,
    // single Output, and attributes carried over from the pattern.
    cpp::OpDesc op_desc;
    op_desc.SetType("__xpu__embedding_with_eltwise_add");
    std::vector<std::string> ids_names;
    std::vector<std::string> table_names;
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      ids_names.push_back(matched.at(ids_name)->arg()->name);
      auto table_name = paddle::lite::string_format("table%d", i);
      table_names.push_back(matched.at(table_name)->arg()->name);
    }
    op_desc.SetInput("Ids", ids_names);
    op_desc.SetInput("Tables", table_names);
    // The fused op writes to the last elementwise_add's output variable.
    auto output_name = paddle::lite::string_format(
        "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
    op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
    op_desc.SetAttr<int>("n_embedding", n_embedding_);
    // padding_idx is taken from the first embedding; presumably all matched
    // lookup_tables share it — not asserted by the pattern (review note).
    auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
    op_desc.SetAttr<int64_t>(
        "padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));

    // Repurpose the embedding0 statement node to host the fused op: create
    // the op, attach it to the original scope/places, and swap in kernels.
    auto* new_stmt = matched.at("embedding0")->stmt();
    auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
    new_op->Attach(op_desc, new_stmt->op()->scope());
    new_op->SetValidPlaces(new_stmt->op()->valid_places());
    auto kernels = new_op->CreateKernels(new_op->valid_places());
    new_stmt->SetOp(new_op);
    new_stmt->SetKernels(std::move(kernels));

    // Wire every Ids/Tables var into the fused node and link its output.
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      DirectedLink(matched.at(ids_name), matched.at("embedding0"));
      DirectedLink(matched.at(table_name), matched.at("embedding0"));
    }
    IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
  }

 private:
  // Number of lookup_table ops in the matched chain.
  int n_embedding_;
};
} // namespace fusion
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
for (int n_embedding : {4, 3}) {
fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding);
fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses mul + elementwise_add (+ optional trailing relu) into a single
// XPU __xpu__fc op. The weight W is quantized to int16 in place and
// transposed so the XPU kernel can consume it directly.
class XPUFcFuser : public FuseBase {
 public:
  // with_relu: when true, the pattern additionally requires and absorbs a
  // relu following the elementwise_add.
  explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}

  void BuildPattern() override {
    // create nodes.
    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
    auto* b = VarNode("b")->assert_is_persistable_var();
    auto* mul = OpNode("mul", "mul");
    auto* mul_out = VarNode("mul_out");
    auto* add = OpNode("add", "elementwise_add");
    auto* Out = VarNode("Out");

    // create topology: mul(x, W) -> mul_out; add(mul_out, b) -> ...
    std::vector<PMNode*> mul_inputs{W, x};
    std::vector<PMNode*> add_inputs{mul_out, b};
    mul_inputs >> *mul >> *mul_out;

    // Some op specialities: all matched internals are consumed by the fusion.
    mul_out->AsIntermediate();
    mul->AsIntermediate();
    add->AsIntermediate();

    if (with_relu_) {
      // add -> add_out -> relu -> Out; add_out and relu disappear too.
      auto* add_out = VarNode("add_out");
      auto* relu = OpNode("relu", "relu");
      std::vector<PMNode*> relu_inputs{add_out};
      add_inputs >> *add >> *add_out;
      relu_inputs >> *relu >> *Out;
      add_out->AsIntermediate();
      relu->AsIntermediate();
    } else {
      add_inputs >> *add >> *Out;
    }
  }

  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto mul = matched.at("mul")->stmt()->op();
    auto* scope = mul->scope();

    // convert W from float to int16, and transpose W
    auto weight_name = matched.at("W")->arg()->name;
    auto* weight_t = scope->FindMutableTensor(weight_name);
    auto weight_dims = weight_t->dims();
    int weight_len = weight_t->numel();
    float* weight_on_host = weight_t->mutable_data<float>();
    // Per-tensor quantization scale = max(|W|).
    float max_f =
        paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
    std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
    std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
    paddle::lite::xpu::math::ConvertFP32ToInt16(
        weight_on_host, weight_int16.get(), max_f, weight_len);
    paddle::lite::xpu::math::Transpose(weight_int16.get(),
                                       weight_trans_int16.get(),
                                       weight_dims[0],
                                       weight_dims[1]);
    // NOTE(review): the int16 payload is copied back into the tensor's
    // float storage; the tensor still claims float dtype but presumably the
    // XPU fc kernel reinterprets it as int16 (scale travels via "w_max",
    // layout via "transpose_w") -- confirm against the kernel.
    memcpy(
        weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));

    // Create the fused op on a fresh graph node and wire it up.
    auto op_desc = GenOpDesc(matched, max_f, true);
    auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
    auto& valid_places = mul->valid_places();
    fc_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);

    IR_NODE_LINK_TO(matched.at("W"), new_op_node);
    IR_NODE_LINK_TO(matched.at("x"), new_op_node);
    IR_NODE_LINK_TO(matched.at("b"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
  }

 private:
  // Builds the __xpu__fc op desc, carrying over mul's attributes and adding
  // the quantization scale (w_max) and layout flag (transpose_w).
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
                        float w_max,
                        bool transpose_w) {
    // Start from mul's op_info so untouched attributes are preserved.
    cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
    op_desc.mutable_inputs()->clear();
    op_desc.mutable_outputs()->clear();
    op_desc.SetType("__xpu__fc");
    op_desc.SetInput("Input", {matched.at("x")->arg()->name});
    op_desc.SetInput("W", {matched.at("W")->arg()->name});
    op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
    op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
    op_desc.SetAttr(
        "in_num_col_dims",
        matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
    op_desc.SetAttr("w_max", w_max);
    op_desc.SetAttr("transpose_w", transpose_w);
    if (with_relu_) {
      op_desc.SetAttr("activation_type", std::string{"relu"});
    }
    return op_desc;
  }

  // Whether the matched pattern includes (and the fused op performs) relu.
  bool with_relu_;
};
} // namespace fusion
class XPUFcFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUFcFuser fuser(true /* with_relu */);
fuser(graph.get());
fusion::XPUFcFuser fuser2(false /* with_relu */);
fuser2(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("fc");
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "lite/backends/xpu/math.h" #include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h" #include "lite/operators/subgraph_op.h"
...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { ...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels)); multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast // remove dangling/useless cast
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr; Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) { for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt()); CHECK(node->IsStmt());
...@@ -597,17 +597,40 @@ class XPUMultiEncoderFuser { ...@@ -597,17 +597,40 @@ class XPUMultiEncoderFuser {
stack = node; stack = node;
} }
} }
if (stack) {
std::unordered_set<const Node*> to_remove2;
Node* stack_out = stack->outlinks.front(); Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) { // avoid modification while traversing
auto stack_out_outlinks = stack_out->outlinks;
for (Node* cast : stack_out_outlinks) {
if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front(); Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) { if (cast_out->outlinks.size() == 0) {
// remove // dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out); to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast); to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
} }
} }
GraphSafeRemoveNodes(graph, to_remove2); GraphSafeRemoveNodes(graph, to_remove2);
} }
}
}; };
} // namespace fusion } // namespace fusion
......
...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t = auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>(); scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()), CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1])) static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first " << "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights"; << "dim size of the conv weights";
} else { } else {
...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8 // compute new conv_weight for int8
auto weight_scale = auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale"); conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else { } else {
// compute new conv_weight // compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>(); auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
fusion::ScaleActivationFuser fuser(act_type);
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
paddle::lite::mir::ScaleActivationFusePass)
.BindTargets({TARGET(kARM)})
.BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
// Pass that fuses a `scale` op with a directly-following activation
// (relu / relu6 / leaky_relu) into a single `scale` op carrying the
// activation as attributes.
class ScaleActivationFusePass : public ProgramPass {
 public:
  // Applies the fusion over the whole program graph.
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Declares the pattern x -> scale -> scale_out -> act -> out, where the
// scale op, the activation op and the connecting var are all marked
// intermediate (i.e. consumed by the fusion).
void ScaleActivationFuser::BuildPattern() {
  // create input nodes.
  auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();

  // create op nodes: both disappear after fusion.
  auto* scale =
      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
  auto* act =
      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();

  // create intermediate nodes: scale's output must feed the activation's X.
  auto* scale_out = VarNode("scale_out")
                        ->assert_is_op_output("scale", "Out")
                        ->assert_is_op_input(act_type_, "X")
                        ->AsIntermediate();

  // create output node
  auto* out =
      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();

  // create topology.
  *x >> *scale >> *scale_out;
  *scale_out >> *act >> *out;
}
// Creates the fused `scale` op node and rewires input/output vars to it.
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
                                         const key2nodes_t& matched) {
  // Build the fused description, then a fresh scale op from it.
  auto fused_desc = GenOpDesc(matched);
  auto fused_op = LiteOpRegistry::Global().Create("scale");
  // Reuse scope and valid places from the matched scale op.
  auto matched_scale = matched.at("scale")->stmt()->op();
  auto* scope = matched_scale->scope();
  auto& valid_places = matched_scale->valid_places();
  fused_op->Attach(fused_desc, scope);
  auto* fused_node = graph->GraphCreateInstructNode(fused_op, valid_places);

  // Rewire: input var -> fused op -> output var.
  IR_NODE_LINK_TO(matched.at("x"), fused_node);
  IR_NODE_LINK_TO(fused_node, matched.at("output"));
}
// Builds the fused op desc: the matched scale op's desc plus the
// activation type and its parameter.
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
  // Start from the matched scale op so its attributes carry over, then
  // point its output at the activation's output var.
  cpp::OpDesc fused_desc = *matched.at("scale")->stmt()->op_info();
  fused_desc.SetOutput("Out", {matched.at("output")->arg()->name});
  fused_desc.SetAttr("activation_type", act_type_);

  const cpp::OpDesc& act_desc = *matched.at("act")->stmt()->op_info();
  if (act_type_ == "relu") {
    fused_desc.SetAttr("fuse_relu", true);
  } else if (act_type_ == "relu6") {
    // relu6 stores its upper bound as "threshold".
    fused_desc.SetAttr("alpha", act_desc.GetAttr<float>("threshold"));
  } else if (act_type_ == "leaky_relu") {
    // leaky_relu stores its negative slope as "alpha".
    fused_desc.SetAttr("alpha", act_desc.GetAttr<float>("alpha"));
  }
  return fused_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Matches `scale` followed by an activation (relu / relu6 / leaky_relu)
// and replaces the pair with a single fused `scale` op.
class ScaleActivationFuser : public FuseBase {
 public:
  // act_type must be one of: "relu", "relu6", "leaky_relu".
  // Member-initializer list instead of default-construct-then-assign.
  explicit ScaleActivationFuser(const std::string& act_type)
      : act_type_(act_type) {}

  // Declares the x -> scale -> act subgraph pattern.
  void BuildPattern() override;
  // Creates the fused op node and rewires inputs/outputs.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;

 private:
  // Generates the fused op desc: scale attrs + activation attrs.
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // The activation op type absorbed by the fusion.
  std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -25,16 +25,16 @@ namespace lite { ...@@ -25,16 +25,16 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
last_output_shapes.clear();
last_output_lods.clear();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods[i] = current_outputs->at(i)->lod(); last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -71,12 +71,17 @@ class Optimizer { ...@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", // "lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass", "__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass", "__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
#include "lite/fluid/float16.h"
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_image_converter.h"
...@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { ...@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
return true; return true;
} }
// Appends `content` as one line to the file at `log_dir`.
// Returns false (with an INFO log) when `log_dir` is empty or the file
// cannot be opened; returns true after a successful write.
static bool write_precision_summary_tofile(const std::string& content,
                                           const std::string& log_dir = "") {
  // `.empty()` instead of comparison with ""; early returns flatten the
  // original if/else.
  if (log_dir.empty()) {
    LOG(INFO) << "The `log_dir` of precision summary file is not set. log_dir:"
              << log_dir;
    return false;
  }
  // "a": append so successive summaries accumulate in one file.
  FILE* fp = fopen(log_dir.c_str(), "a");
  if (fp == nullptr) {
    // Fixed missing space before "failed" in the log message.
    LOG(INFO) << "Open precision summary file:" << log_dir << " failed.";
    return false;
  }
  fprintf(fp, "%s\n", content.c_str());
  fclose(fp);
  return true;
}
class PrecisionProfiler { class PrecisionProfiler {
public: public:
// TODO(ysh329): need to remove `explicit PrecisionProfiler` // TODO(ysh329): need to remove `explicit PrecisionProfiler`
...@@ -67,7 +86,7 @@ class PrecisionProfiler { ...@@ -67,7 +86,7 @@ class PrecisionProfiler {
using std::left; using std::left;
using std::fixed; using std::fixed;
STL::stringstream ss; STL::stringstream ss;
ss << "========================================= " ss << "\n\n========================================= "
<< "Detailed Precision Profiler Summary " << "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl; << "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)" ss << setw(45) << left << "operator:(kernel_info)"
...@@ -77,6 +96,13 @@ class PrecisionProfiler { ...@@ -77,6 +96,13 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation" << " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl; << " " << setw(15) << left << "ave_grow_rate*" << std::endl;
// write to file with path: `log_dir`
if (log_dir_ != "") {
FILE* fp = fopen(log_dir_.c_str(), "a");
std::string header_str{ss.str()};
fprintf(fp, "%s\n", header_str.c_str());
fclose(fp);
}
return ss.str(); return ss.str();
} }
...@@ -194,6 +220,7 @@ class PrecisionProfiler { ...@@ -194,6 +220,7 @@ class PrecisionProfiler {
} }
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) { } else if (target_type == TARGET(kOpenCL)) {
CLRuntime::Global()->command_queue().finish();
switch (layout_type) { switch (layout_type) {
case DATALAYOUT(kImageDefault): { case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
...@@ -360,8 +387,12 @@ class PrecisionProfiler { ...@@ -360,8 +387,12 @@ class PrecisionProfiler {
} }
} }
} }
write_precision_summary_tofile(ss.str(), log_dir_);
return ss.str(); return ss.str();
} }
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
}; };
} // namespace profile } // namespace profile
......
...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { ...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr; return nullptr;
} }
// AttributeVarNames returns the names of persistable attribute variables
// stored in the parent scopes (weights live in ancestors of the execution
// scope). The bookkeeping variables "feed" and "fetch" are excluded.
std::vector<std::string> Scope::AttributeVarNames() const {
  std::vector<std::string> resulted_keys;
  const Scope *cur_scope = this;
  // Walk up the scope chain; only ancestor scopes are inspected, the
  // current scope's own locals are deliberately not included.
  while (cur_scope->parent()) {
    cur_scope = cur_scope->parent();
    auto keys = cur_scope->LocalVarNames();
    resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
  }
  // Remove every occurrence of feed/fetch via the erase-remove idiom
  // (single pass per name instead of repeated find+erase), which also
  // avoids the signed/unsigned loop-index comparison of the original.
  const std::vector<std::string> skipped_vars = {"feed", "fetch"};
  for (const auto &skipped : skipped_vars) {
    resulted_keys.erase(
        std::remove(resulted_keys.begin(), resulted_keys.end(), skipped),
        resulted_keys.end());
  }
  return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys; std::vector<std::string> keys;
for (const auto &item : vars_) { for (const auto &item : vars_) {
......
...@@ -45,6 +45,8 @@ class Scope final { ...@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface. // Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
......
project(demo CXX C)
cmake_minimum_required(VERSION 2.8) cmake_minimum_required(VERSION 2.8)
project(demo CXX C)
add_definitions(-DLITE_WITH_CUDA)
set(TARGET demo) set(TARGET demo)
set(CMAKE_CXX_FLAGS "-std=c++11 -O3") set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
include_directories("${LITE_LIB}/include") include_directories("${LITE_ROOT}/include")
link_directories("${LITE_LIB}/lib") link_directories("${LITE_ROOT}/lib")
link_directories("${PROTOBUF_LIB}/lib") link_directories("${PROTOBUF_ROOT}/lib")
# cuda lib
link_directories("/usr/local/cuda/lib64/")
add_executable(${TARGET} ${TARGET}.cc) add_executable(${TARGET} ${TARGET}.cc)
set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
set(DEPS ${DEPS} protobuf-lite) set(DEPS ${DEPS} protobuf-lite)
set(DEPS ${DEPS} "-lrt -lpthread -ldl") set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
target_link_libraries(${TARGET} ${DEPS}) target_link_libraries(${TARGET} ${DEPS})
# Demo Makefile for Android armeabi-v7a (ARMv7) builds of the classification
# and yolov3 test binaries, linked against Paddle-Lite (full/light API,
# shared/static variants) and a prebuilt static OpenCV.
ARM_ABI = arm7
export ARM_ABI

include ../Makefile.def

LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party

OPENCV_VERSION=opencv4.1.0

# Static OpenCV libraries plus the bundled 3rdparty archives they depend on.
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include

CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)

# Paddle-Lite link options (full vs light API, shared vs static).
# Renamed from the misspelled *_SHAPRED_* variables; all uses are file-local.
LITE_FULL_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a

##########
# Download and unpack the prebuilt OpenCV package if not already present.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

test_helper.o: test_helper.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc

classification_full.o: classification_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc

classification_light.o: classification_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc

classification_full_shared: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

classification_full_static: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

classification_light_shared: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

classification_light_static: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

######
yolov3_full.o: yolov3_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc

yolov3_light.o: yolov3_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc

yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

# Fix: prerequisite was yolov3_full.o while the link line uses yolov3_light.o,
# so yolov3_light.o was never built by this rule.
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static

# Fix: use `rm -f` uniformly (two targets wrongly used `rm -r`, which fails
# on regular files with some rm implementations and errors if absent).
clean:
	rm -f *.o
	rm -f classification_full_shared
	rm -f classification_full_static
	rm -f classification_light_shared
	rm -f classification_light_static
	rm -f yolov3_full_shared
	rm -f yolov3_full_static
	rm -f yolov3_light_shared
	rm -f yolov3_light_static
# Demo Makefile for Android arm64-v8a (ARMv8) builds of the classification
# and yolov3 test binaries, linked against Paddle-Lite (full/light API,
# shared/static variants) and a prebuilt static OpenCV.
ARM_ABI = arm8
export ARM_ABI

include ../Makefile.def

LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party

OPENCV_VERSION=opencv4.1.0

# Static OpenCV libraries plus the bundled 3rdparty archives they depend on.
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include

CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)

# Paddle-Lite link options (full vs light API, shared vs static).
# Renamed from the misspelled *_SHAPRED_* variables; all uses are file-local.
LITE_FULL_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHARED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a

##########
# Download and unpack the prebuilt OpenCV package if not already present.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

test_helper.o: test_helper.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc

classification_full.o: classification_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc

classification_light.o: classification_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc

classification_full_shared: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

classification_full_static: fetch_opencv classification_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

classification_light_shared: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

classification_light_static: fetch_opencv classification_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

######
yolov3_full.o: yolov3_full.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc

yolov3_light.o: yolov3_light.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc

yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHARED_LIBS}

yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHARED_LIBS}

# Fix: prerequisite was yolov3_full.o while the link line uses yolov3_light.o,
# so yolov3_light.o was never built by this rule.
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)

#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static

# Fix: use `rm -f` uniformly (two targets wrongly used `rm -r`).
clean:
	rm -f *.o
	rm -f classification_full_shared
	rm -f classification_full_static
	rm -f classification_light_shared
	rm -f classification_light_static
	rm -f yolov3_full_shared
	rm -f yolov3_full_static
	rm -f yolov3_light_shared
	rm -f yolov3_light_static
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
"model_dir.");
DEFINE_string(model_filename,
"",
"the filename of model file. When the model is combined formate, "
"please set model_file.");
DEFINE_string(param_filename,
"",
"the filename of param file, set param_file when the model is "
"combined formate.");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor");
// Optimize model for ARM CPU.
// If the model is not combined, set model_filename and params_filename as empty
void OptModel(const std::string& load_model_dir,
const std::string& model_filename,
const std::string& params_filename,
const std::string& save_model_path) {
paddle::lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
if (!model_filename.empty() && !params_filename.empty()) {
config.set_model_file(load_model_dir + "/" + model_filename);
config.set_param_file(load_model_dir + "/" + params_filename);
}
std::vector<paddle::lite_api::Place> vaild_places = {
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)},
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)},
};
config.set_valid_places(vaild_places);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::string cmd_str = "rm -rf " + save_model_path;
int ret = system(cmd_str.c_str());
if (ret == 0) {
std::cout << "Delete old optimized model " << save_model_path << std::endl;
}
predictor->SaveOptimizedModel(save_model_path,
paddle::lite_api::LiteModelType::kNaiveBuffer);
std::cout << "Load model from " << load_model_dir << std::endl;
std::cout << "Save optimized model to " << save_model_path << std::endl;
}
// Run the optimized model once and check the top-1 classification result
// against ground truth.
//
// model_path: path of the optimized .nb model.
// img_path: raw input image, preprocessed with OpenCV.
// img_txt_path: preprocessed input stored as whitespace-separated floats;
//               takes precedence over img_path when non-empty.
// out_max_value / out_max_value_index: expected top-1 score and its index.
// threshold: max allowed absolute diff between score and ground truth.
// height / width: model input spatial size.
void Run(const std::string& model_path,
         const std::string& img_path,
         const std::string& img_txt_path,
         const float out_max_value,
         const int out_max_value_index,
         const float threshold,
         const int height,
         const int width) {
  // set config and create predictor
  paddle::lite_api::MobileConfig config;
  config.set_threads(3);
  config.set_model_from_file(model_path);
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // set input (NCHW float)
  auto input_tensor = predictor->GetInput(0);
  input_tensor->Resize({1, 3, height, width});
  auto input_data = input_tensor->mutable_data<float>();
  if (img_txt_path.size() > 0) {
    std::fstream fs(img_txt_path);
    if (!fs.is_open()) {
      std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
      // Bug fix: previously execution fell through and kept reading from the
      // unopened stream, silently feeding garbage to the model.
      exit(1);
    }
    int num = 1 * 3 * height * width;
    for (int i = 0; i < num; i++) {
      fs >> input_data[i];
    }
  } else {
    cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
    if (!img.data) {
      std::cerr << "Fail to open img:" << img_path << std::endl;
      exit(1);
    }
    // ImageNet normalization constants.
    float means[3] = {0.485f, 0.456f, 0.406f};
    float scales[3] = {0.229f, 0.224f, 0.225f};
    process_img(img, width, height, input_data, means, scales);
  }

  predictor->Run();

  // Find the argmax of the output tensor.
  auto out_tensor = predictor->GetOutput(0);
  auto* out_data = out_tensor->data<float>();
  int64_t output_num = ShapeProduction(out_tensor->shape());
  float max_value = out_data[0];
  int max_index = 0;
  for (int i = 0; i < output_num; i++) {
    if (max_value < out_data[i]) {
      max_value = out_data[i];
      max_index = i;
    }
  }
  std::cout << "max_value:" << max_value << std::endl;
  std::cout << "max_index:" << max_index << std::endl;
  std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
  std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
  if (max_index != out_max_value_index ||
      fabs(max_value - out_max_value) > threshold) {
    std::cerr << "----------Fail Test.---------- \n\n";
  } else {
    std::cout << "----------Pass Test.---------- \n\n";
  }
}
// Entry point: optimize the model given by --model_dir for ARM CPU, then run
// the classification test on the optimized model.
int main(int argc, char** argv) {
  // Check inputs
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_model_dir.empty() ||
      (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
    std::cerr << "Input error." << std::endl;
    std::cerr
        << "Usage: " << argv[0] << std::endl
        << "--model_dir: the path of not optimized model \n"
           "--model_filename: the model filename of not optimized model \n"
           "--param_filename: the param filename of not optimized model \n"
           "--img_txt_path: the path of input image, the image is processed \n"
           " and saved in txt file \n"
           "--img_path: the path of input image \n"
           "--out_max_value: The max value in output tensor \n"
           "--threshold: If the max value diff is smaller than threshold,\n"
           " pass test. Default 1e-3.\n"
           "--out_max_value_index: The max value index in output tensor \n";
    exit(1);
  }

  // Model input size is fixed at 224x224 (ImageNet-style classification).
  const int height = 224;
  const int width = 224;
  // Strip a trailing '/' so the "_opt2" suffix lands on the directory name.
  std::string model_dir = FLAGS_model_dir;
  if (model_dir.back() == '/') {
    model_dir.pop_back();
  }
  std::string optimized_model_path = model_dir + "_opt2";
  OptModel(FLAGS_model_dir,
           FLAGS_model_filename,
           FLAGS_param_filename,
           optimized_model_path);
  // SaveOptimizedModel appends ".nb" to the path it is given.
  std::string run_model_path = optimized_model_path + ".nb";
  // Run test
  Run(run_model_path,
      FLAGS_img_path,
      FLAGS_img_txt_path,
      FLAGS_out_max_value,
      FLAGS_out_max_value_index,
      FLAGS_threshold,
      height,
      width);
  return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(optimized_model_path, "", "the path of optimized model");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor");
// Run an already-optimized (.nb) model once and check the top-1
// classification result against ground truth.
//
// model_path: path of the optimized .nb model.
// img_path: raw input image, preprocessed with OpenCV.
// img_txt_path: preprocessed input stored as whitespace-separated floats;
//               takes precedence over img_path when non-empty.
// out_max_value / out_max_value_index: expected top-1 score and its index.
// threshold: max allowed absolute diff between score and ground truth.
// height / width: model input spatial size.
void Run(const std::string& model_path,
         const std::string& img_path,
         const std::string& img_txt_path,
         const float out_max_value,
         const int out_max_value_index,
         const float threshold,
         const int height,
         const int width) {
  // set config and create predictor
  paddle::lite_api::MobileConfig config;
  config.set_threads(3);
  config.set_model_from_file(model_path);
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // set input (NCHW float)
  auto input_tensor = predictor->GetInput(0);
  input_tensor->Resize({1, 3, height, width});
  auto input_data = input_tensor->mutable_data<float>();
  if (img_txt_path.size() > 0) {
    std::fstream fs(img_txt_path);
    if (!fs.is_open()) {
      std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
      // Bug fix: previously execution fell through and kept reading from the
      // unopened stream, silently feeding garbage to the model.
      exit(1);
    }
    int num = 1 * 3 * height * width;
    for (int i = 0; i < num; i++) {
      fs >> input_data[i];
    }
  } else {
    cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
    if (!img.data) {
      std::cerr << "Fail to open img:" << img_path << std::endl;
      exit(1);
    }
    // ImageNet normalization constants.
    float means[3] = {0.485f, 0.456f, 0.406f};
    float scales[3] = {0.229f, 0.224f, 0.225f};
    process_img(img, width, height, input_data, means, scales);
  }

  predictor->Run();

  // Find the argmax of the output tensor.
  auto out_tensor = predictor->GetOutput(0);
  auto* out_data = out_tensor->data<float>();
  int64_t output_num = ShapeProduction(out_tensor->shape());
  float max_value = out_data[0];
  int max_index = 0;
  for (int i = 0; i < output_num; i++) {
    if (max_value < out_data[i]) {
      max_value = out_data[i];
      max_index = i;
    }
  }
  std::cout << "max_value:" << max_value << std::endl;
  std::cout << "max_index:" << max_index << std::endl;
  std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
  std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
  if (max_index != out_max_value_index ||
      fabs(max_value - out_max_value) > threshold) {
    std::cerr << "----------Fail Test---------- \n\n";
  } else {
    std::cout << "----------Pass Test---------- \n\n";
  }
}
// Entry point: run the classification test directly on an already-optimized
// (.nb) model given by --optimized_model_path.
int main(int argc, char** argv) {
  // Check inputs
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_optimized_model_path.empty() ||
      (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
    std::cerr << "Input error." << std::endl;
    std::cerr
        << "Usage: " << argv[0] << std::endl
        << "--optimized_model_path: the path of optimized model \n"
           "--img_txt_path: the path of input image, the image is processed \n"
           " and saved in txt file \n"
           "--img_path: the path of input image \n"
           "--out_max_value: The max value in output tensor \n"
           "--threshold: If the max value diff is smaller than threshold,\n"
           " pass test. Default 1e-3.\n"
           "--out_max_value_index: The max value index in output tensor \n";
    exit(1);
  }

  // Model input size is fixed at 224x224 (ImageNet-style classification).
  const int height = 224;
  const int width = 224;
  // Run test
  Run(FLAGS_optimized_model_path,
      FLAGS_img_path,
      FLAGS_img_txt_path,
      FLAGS_out_max_value,
      FLAGS_out_max_value_index,
      FLAGS_threshold,
      height,
      width);
  return 0;
}
# Build all demo binaries and package them, together with the Paddle-Lite
# C++ libraries and the test models/images, into ${gf} for pushing to a device.
make clean
make all -j

gf=test_lite_lib_files
# Start from a clean staging directory.
if [ -d ${gf} ];then
    rm -rf ${gf}
fi
mkdir ${gf}

# Collect the freshly built executables.
mv classification_full_shared ${gf}
mv classification_full_static ${gf}
mv classification_light_shared ${gf}
mv classification_light_static ${gf}
mv yolov3_full_shared ${gf}
mv yolov3_full_static ${gf}
mv yolov3_light_shared ${gf}
mv yolov3_light_static ${gf}
cp run.sh ${gf}
make clean

# Bundle the Paddle-Lite C++ libs as ${gf}/lite.
cp -r ../../../cxx/ ${gf}
mv ${gf}/cxx ${gf}/lite

# Fetch test models and images once; the tarball is cached locally.
if [ ! -f "test_libs_models_imgs.tgz" ];then
    wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz
fi
tar zxvf test_libs_models_imgs.tgz
mv test_libs_models_imgs ${gf}
mv ${gf}/test_libs_models_imgs ${gf}/models_imgs
# On-device test driver: run every packaged binary against the bundled
# models/images. Each invocation prints Pass/Fail against the ground-truth
# top-1 value/index (classification) or box values (yolov3).
# Shared-library builds need lite/lib on LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH}

# mobilenetv1
./classification_light_shared \
    --optimized_model_path=models_imgs/models/mobilenetv1.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_light_static \
    --optimized_model_path=models_imgs/models/mobilenetv1.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_full_static \
    --model_dir=models_imgs/models/mobilenetv1 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

./classification_full_shared \
    --model_dir=models_imgs/models/mobilenetv1 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.936887 \
    --out_max_value_index=65

# mobilenetv2
./classification_light_shared \
    --optimized_model_path=models_imgs/models/mobilenetv2.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_light_static \
    --optimized_model_path=models_imgs/models/mobilenetv2.nb \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_full_static \
    --model_dir=models_imgs/models/mobilenetv2 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

./classification_full_shared \
    --model_dir=models_imgs/models/mobilenetv2 \
    --img_txt_path=models_imgs/images/classification.jpg.txt \
    --out_max_value=0.868888 \
    --out_max_value_index=65

# yolov3
./yolov3_light_shared \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_light_static \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_full_static \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014

./yolov3_full_shared \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=0,0.153605,174.494,199.729,562.075,604.014
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "test_helper.h" // NOLINT
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Returns the current wall-clock time in microseconds.
double GetCurrentUS() {
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<double>(now.tv_sec) * 1e+6 +
         static_cast<double>(now.tv_usec);
}
// Product of all dims in `shape`; returns 1 for an empty shape.
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
  return std::accumulate(shape.begin(),
                         shape.end(),
                         static_cast<int64_t>(1),
                         std::multiplies<int64_t>());
}
// Parse a comma-separated integer list, e.g. "1,3,224,224".
// Returns one entry per comma-separated field; an empty string yields an
// empty vector. Bug fix: parsing previously used atoi (int), silently
// truncating values outside 32-bit range even though the function returns
// int64_t; atoll preserves the full 64-bit value.
std::vector<int64_t> GetIntNumsFromStr(const std::string& str) {
  std::vector<int64_t> nums;
  std::string tmp_str = str;
  while (!tmp_str.empty()) {
    int64_t num = atoll(tmp_str.data());
    nums.push_back(num);
    size_t next_offset = tmp_str.find(",");
    if (next_offset == std::string::npos) {
      break;
    } else {
      tmp_str = tmp_str.substr(next_offset + 1);
    }
  }
  return nums;
}
// Parse a comma-separated floating-point list, e.g. "0.5,1.25,-3".
// Returns one entry per comma-separated field; an empty string yields an
// empty vector.
std::vector<double> GetDoubleNumsFromStr(const std::string& str) {
  std::vector<double> result;
  std::string rest = str;
  for (;;) {
    if (rest.empty()) {
      break;
    }
    result.push_back(atof(rest.data()));
    const size_t comma_pos = rest.find(",");
    if (comma_pos == std::string::npos) {
      break;
    }
    rest = rest.substr(comma_pos + 1);
  }
  return result;
}
// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
//
// din: interleaved RGB float input, `size` pixels (3 * size floats).
// dout: planar output; channel c occupies [dout + c*size, dout + (c+1)*size).
// mean/scale: per-channel normalization, out = (in - mean[c]) / scale[c].
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale) {
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  // Multiply by reciprocal instead of dividing per pixel.
  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);

  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;

  // Vectorized main loop: 4 pixels (12 floats) per iteration; vld3q
  // de-interleaves the RGB triples into three registers.
  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);
    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  // Scalar tail for the remaining (size % 4) pixels.
  // Bug fix: all three channel values were previously written through
  // dout_c0, scrambling the planar layout and overrunning channel 0's
  // region into channel 1.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
  }
}
// Process img and set it as input
// Converts BGR -> RGB, resizes to (width, height), scales pixel values to
// [0, 1], then normalizes with means/scales while transposing NHWC -> NCHW
// into dest_data (which must hold 3 * width * height floats).
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dest_data,
                 float* means,
                 float* scales) {
  cv::Mat rgb_img;
  // OpenCV loads images as BGR; the model expects RGB.
  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
  cv::Mat imgf;
  // Convert uint8 [0,255] to float [0,1].
  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
  const float* dimg = reinterpret_cast<const float*>(imgf.data);
  neon_mean_scale(dimg, dest_data, width * height, means, scales);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"

// Returns the current wall-clock time in microseconds.
double GetCurrentUS();

// Product of all dims in `shape`; returns 1 for an empty shape.
int64_t ShapeProduction(const std::vector<int64_t>& shape);

// Parse a comma-separated integer list, e.g. "1,3,224,224".
std::vector<int64_t> GetIntNumsFromStr(const std::string& str);

// Parse a comma-separated floating-point list, e.g. "0.5,1.25".
std::vector<double> GetDoubleNumsFromStr(const std::string& str);

// Normalize ((x - mean[c]) / scale[c]) and transpose NHWC -> NCHW,
// NEON-accelerated; `size` is the pixel count per channel.
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale);

// BGR -> RGB, resize to (width, height), scale to [0,1], then normalize into
// dst_data laid out as NCHW (3 * width * height floats).
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dst_data,
                 float* means,
                 float* scales);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册