未验证 提交 e7d31e1f 编写于 作者: S Santa An 提交者: GitHub

Merge branch 'develop' into baolei/bitmain

...@@ -63,6 +63,16 @@ test/models/ ...@@ -63,6 +63,16 @@ test/models/
test/images/ test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files # Emacs intermediate files
*~ *~
......
...@@ -45,7 +45,7 @@ else() ...@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling # we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
###################################################################################################### ######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` ...@@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib`
4、 `demo` 文件夹:c++ demo. 4、 `demo` 文件夹:c++ demo.
如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so` 如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`
## 运行 ## 运行
...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg ...@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
二: 运行 二: 运行
**NOTE:**此处示例使用的是python接口。 **NOTE:** 此处示例使用的是python接口。
``` python ``` python
#-*- coding: utf-8 -*- #-*- coding: utf-8 -*-
...@@ -75,7 +75,7 @@ import sys ...@@ -75,7 +75,7 @@ import sys
import numpy as np import numpy as np
import cv2 import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib') sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import * from lite import *
def read_img(im_path, resize_h, resize_w): def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32') im = cv2.imread(im_path).astype('float32')
......
...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
) )
add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG)
lite_cc_library(place SRCS paddle_place.cc DEPS logging) lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else() else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog) lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; } ...@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames // get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; } std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() { void Predictor::PrepareFeedFetch() {
if (!program_) { if (!program_) {
...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, ...@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back( inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
} }
...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { ...@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const { const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name); auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>(); return &var->Get<lite::Tensor>();
} }
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name // get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) { lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name); auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
...@@ -85,6 +85,9 @@ class LITE_API Predictor { ...@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames. // get inputnames and get outputnames.
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
// Get offset-th col of fetch results. // Get offset-th col of fetch results.
...@@ -92,6 +95,9 @@ class LITE_API Predictor { ...@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const; std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const; const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { ...@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names // get inputs names and get outputs names
std::vector<std::string> GetInputNames() override; std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override; std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor( std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override; const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTebsor by name // Get InputTebsor by name
std::unique_ptr<lite_api::Tensor> GetInputByName( std::unique_ptr<lite_api::Tensor> GetInputByName(
......
...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() { ...@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames(); return raw_predictor_.GetInputNames();
} }
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() { std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames(); return raw_predictor_.GetOutputNames();
} }
...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor( ...@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x)); return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
} }
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName( std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) { const std::string &name) {
return std::unique_ptr<lite_api::Tensor>( return std::unique_ptr<lite_api::Tensor>(
......
...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); ...@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0, DEFINE_string(input_shape_0,
"1,3,224,224", "1,3,224,224",
"input shapes another, separated by colon and comma"); "input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb, DEFINE_bool(use_optimize_nb,
false, false,
"optimized & naive buffer model for mobile devices"); "optimized & naive buffer model for mobile devices");
...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
config.set_valid_places({ if (FLAGS_target == "arm") {
Place{TARGET(kARM), PRECISION(kFloat)}, config.set_valid_places({
}); Place{TARGET(kARM), PRECISION(kFloat)},
});
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid, int tid,
const int warmup_times = 5) { const int warmup_times = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes, ...@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat, const int repeat,
int warmup = 5) { int warmup = 5) {
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_model_dir(model_dir); config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode); config.set_power_mode(power_mode);
config.set_threads(thread_num); config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0); config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config); auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) { for (int i = 0; i < 2 * repeat; i += 2) {
...@@ -246,7 +256,8 @@ int main(int argc, char** argv) { ...@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: " LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model"; << "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0); exit(0);
} }
std::string save_optimized_model_dir = ""; std::string save_optimized_model_dir = "";
......
...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); ...@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string( DEFINE_string(
optimize_out_type, optimize_out_type,
"protobuf", "naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer"); "store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info, DEFINE_bool(record_tailoring_info,
...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
std::cout << std::setiosflags(std::ios::internal); std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name"; std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1); std::cout << std::setw(10) << targets[i].substr(1);
} }
std::cout << std::endl; std::cout << std::endl;
...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first; std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second; auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
} }
// Print OP info. // Print OP info.
auto ops_valid_places = supported_ops.at(*op); auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) { for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(), if (std::find(ops_valid_places.begin(),
ops_valid_places.end(), ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) { targets[i]) != ops_valid_places.end()) {
...@@ -288,11 +288,11 @@ void ParseInputCommand() { ...@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces(); auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string // get valid_targets string
std::vector<TargetType> target_types = {}; std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target); target_types.push_back(valid_places[i].target);
} }
std::string targets_str = TargetToStr(target_types[0]); std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) { for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]); targets_str = targets_str + TargetToStr(target_types[i]);
} }
...@@ -301,7 +301,7 @@ void ParseInputCommand() { ...@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk)); target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops; std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) { for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])]; auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end()); valid_ops.insert(ops.begin(), ops.end());
} }
...@@ -318,7 +318,7 @@ void CheckIfModelSupported() { ...@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))]; auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert( valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target; auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)]; auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
...@@ -340,7 +340,7 @@ void CheckIfModelSupported() { ...@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops; std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops; std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) { for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index); auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) { for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i); auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
...@@ -364,13 +364,13 @@ void CheckIfModelSupported() { ...@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str; unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
} }
std::vector<TargetType> targets = {}; std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) { for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target); targets.push_back(valid_places[i].target);
} }
std::sort(targets.begin(), targets.end()); std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]); std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) { for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]); targets_str = targets_str + "," + TargetToStr(targets[i]);
} }
......
...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'"; "command argument 'valid_targets'";
} }
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { void OptBase::SetLiteOut(const std::string& lite_out_name) {
optimize_out_path_ = optimized_out_path; lite_out_name_ = lite_out_name;
} }
void OptBase::RunOptimize(bool record_strip_info) { void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false); CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_); opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") { if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info); RunOptimizeFromModelSet(record_strip_info_);
} else { } else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name = auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model"; record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully"; << " into :" << lite_out_name_ << "successfully";
} }
} }
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset // collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir, void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models, const std::vector<std::string>& models,
...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { ...@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
} }
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set. // 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_); lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true); auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) { if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir = std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/"); lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir = std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/"); lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path = auto model_file_path =
...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel( opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info); lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. "; std::cout << "Optimize done. ";
} }
...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { ...@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) { if (record_strip_info) {
// Collect all models information // Collect all models information
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo( CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :" std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully"; << lite_out_name_ << "successfully";
} }
} }
void OptBase::PrintHelpInfo() { void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version(); const std::string opt_version = lite::version();
const char help_info[] = const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed " "------------------------------------------------------------------------"
"below:\n" "-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n" " Arguments of help information:\n"
" `help()` Print help infomation\n" " `help()` Print help infomation\n"
" Arguments of model optimization:\n" "\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n" " `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n" " `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
" `set_optimize_out(output_optimize_model_dir)`\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `record_model_info(false|true)`: refer to whether to record ops "
" ` ----fasle&true refer to whether to record ops info for " "info for striping lib, false by default`\n"
"tailoring lib, false by default`\n" " `run() : start model transformation`\n"
" Arguments of model checking and ops information:\n" " eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of " " `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid " " `print_supported_ops` Display supported operators of valid "
"places\n" "places\n"
" `check_if_model_supported()` Check if the input model is " " `check_if_model_supported()` Check if the input model is "
"supported\n"; "supported\n"
"------------------------------------------------------------------------"
std::cout << "opt version:" << opt_version << std::endl "-----------------------------------------------------------\n";
<< help_info << std::endl; std::cout << "opt version:" << opt_version << std::endl << help_info;
} }
// 2. Print supported info of inputed ops // 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
...@@ -44,16 +44,21 @@ class LITE_API OptBase { ...@@ -44,16 +44,21 @@ class LITE_API OptBase {
public: public:
OptBase() = default; OptBase() = default;
void SetModelSetDir(const std::string &model_set_path); void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path); void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path); void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path); void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places); void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path); void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type // set optimized_model type
void SetModelType(std::string model_type); void SetModelType(std::string model_type);
// transform and save the optimized model // transform and save the optimized model
void RunOptimize(bool record_strip_info = false); void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info // fuctions of printing info
// 1. help info // 1. help info
void PrintHelpInfo(); void PrintHelpInfo();
...@@ -71,12 +76,12 @@ class LITE_API OptBase { ...@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model // valid places for the optimized_model
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
// filename of the optimized_model // filename of the optimized_model
std::string optimize_out_path_; std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default. // type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer}; LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model // Dir path of a set of models, this should be combined with model
std::string model_set_dir_; std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false); void RunOptimizeFromModelSet(bool record_strip_info = false);
}; };
......
...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } ...@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type, LiteModelType model_type,
bool record_info) { bool record_info) {
......
...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { ...@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0; virtual std::vector<std::string> GetInputNames() = 0;
// Get output names // Get output names
virtual std::vector<std::string> GetOutputNames() = 0; virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name // Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0; virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor { ...@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists. /// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor( virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0; const std::string& name) const = 0;
/// Get a mutable tensor, return null if on one called `name` exists
/// internal infereces API, not recommanded.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by /// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig. /// CxxConfig, and the persisted model can be reused for MobileConfig.
...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; } bool multi_stream() const { return multi_stream_; }
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread. // current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread. // XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0); void set_xpu_dev_per_thread(int dev_no = 0);
}; };
......
...@@ -19,7 +19,13 @@ ...@@ -19,7 +19,13 @@
#pragma once #pragma once
// some platform-independent defintion // some platform-independent defintion
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \ #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \ extern int touch_op_##op_type__(); \
......
...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); ...@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); ...@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { ...@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile) .def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile) .def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces) .def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut) .def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType) .def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize) .def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo) .def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps) .def("print_supported_ops", &OptBase::PrintSupportedOps)
......
...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': ...@@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON':
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
if os.name != 'nt': if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so" /inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0: if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND) raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -21,6 +21,17 @@ namespace paddle { ...@@ -21,6 +21,17 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
void pooling_basic(const float* din, void pooling_basic(const float* din,
float* dout, float* dout,
int num, int num,
...@@ -88,15 +99,27 @@ void pooling_basic(const float* din, ...@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for #pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) { for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h; int sh, eh;
int eh = sh + kernel_h; if (adaptive) {
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = AdaptStartIndex(ind_h, hin, hout);
eh = (eh - pad_h) > hin ? hin : eh - pad_h; eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w; int sw, ew;
int ew = sw + kernel_w; if (adaptive) {
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = AdaptStartIndex(ind_w, win, wout);
ew = (ew - pad_w) > win ? win : ew - pad_w; ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0); float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out + int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w; ind_h * wout + ind_w;
......
此差异已折叠。
...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { ...@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T> template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias); void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T> template <typename T>
void scale(const T* din, void scale(const T* din,
T* dout, T* dout,
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width const sampler_t sampler =
const int y = get_global_id(1); // image_height CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void leaky_relu(__read_only image2d_t input, __kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void tanh_act(__read_only image2d_t input, __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void exp_act(__read_only image2d_t input, __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input,
} }
__kernel void swish(__read_only image2d_t input, __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim1, __private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim1,
__read_only image2d_t input_image, __private const int global_size_dim2,
__read_only image2d_t filter, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
__private const int input_c_block, __private const int input_c_block,
__private const int input_c_origin, __private const int input_c_origin,
__private const int dilation, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int old_w) { __private const int old_w) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
out_w, "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
out_C - 4 * out_c, "%.2f\n",
(float)(in[input_pos0]), out_w,
(float)(in[input_pos1]), out_C - 4 * out_c,
(float)(in[input_pos2]), (float)(in[input_pos0]),
(float)(in[input_pos3])); (float)(in[input_pos1]),
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, (float)(in[input_pos2]),
output_pos.x, output_pos.y, (float)(in[input_pos3]));
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
out_c,
out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
const int in_h = in_nh % in_height; const int in_h = in_nh % in_height;
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
__global uchar* out, __global uchar* out,
__private const int size_ch, __private const int size_ch,
__private const int size_block, __private const int size_block,
__private const int size_batch, __private const int size_batch,
__private const int C) { __private const int C) {
const int in_c = get_global_id(0); const int in_c = get_global_id(0);
const int in_w = get_global_id(1); const int in_w = get_global_id(1);
const int in_nh = get_global_id(2); const int in_nh = get_global_id(2);
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
input_data[i * in_dims[0] + j] + vector_data[j];
} }
} }
} }
......
...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
} }
} }
if (has_value_input == nullptr) { if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl; VLOG(3) << "no input has value! just return";
return; return;
} }
auto input_width = has_value_input->value().dims()[1]; auto input_width = has_value_input->value().dims()[1];
......
...@@ -19,6 +19,7 @@ namespace lite { ...@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr}; thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> { ...@@ -151,14 +151,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) { if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context(); _tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx); CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
} }
return _tls_raw_ctx; return _tls_raw_ctx;
} }
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size); _workspace_l3_size_per_thread = l3_size;
} }
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) { static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV"); const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) { if (dev_env) {
...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> { ...@@ -173,6 +182,7 @@ class Context<TargetType::kXPU> {
private: private:
static thread_local xdnn::Context* _tls_raw_ctx; static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
}; };
#endif #endif
...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> { ...@@ -340,27 +350,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
......
...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes ...@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
variable_place_inference_pass.cc variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
// Matches the pattern  pre_op -> x -> dropout -> out  (with a side Mask
// output) and removes the identity dropout, rewiring pre_op to produce
// `out` directly. Only inference-mode ("is_test" == 1) dropout with the
// "upscale_in_train" implementation is matched, because only that variant
// is an exact identity at inference time.
class Eliminator : public FuseBase {
 public:
  void BuildPattern() override {
    // The previous op's output needs updating after elimination.
    // conditional_block is excluded: its outputs cannot be renamed safely.
    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
    // TODO(Superjomn) check has only one output
    auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
    auto* dropout_op = OpNode("dropout", "dropout")
                           ->assert_op_attr<int>("is_test", 1)
                           ->assert_op_attr<std::string>(
                               "dropout_implementation", "upscale_in_train");
    auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
    auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");
    // Topology: pre_op -> x -> dropout -> out, plus the auxiliary Mask.
    *pre_op >> *x >> *dropout_op >> *out;
    *dropout_op >> *mask;
    // The nodes marked AsIntermediate are deleted by the matcher; pre_op is
    // kept and re-pointed at `out` in InsertNewNode below.
    x->AsIntermediate();  // x is pre_op's output, need to update
    dropout_op->AsIntermediate();
    mask->AsIntermediate();
  }

 private:
  // Rewrites pre_op so every output formerly named `x` is renamed to `out`,
  // then links pre_op straight to `out` in the graph.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto& pre_op = matched.at("preop")->AsStmt();
    auto op_info = *pre_op.op_info();
    op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
                             matched.at("out")->AsArg().name);
    // ResetOp re-creates the op/kernels from the updated desc.
    pre_op.ResetOp(op_info, graph->valid_places());
    IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
  }
};
} // namespace
class IdentityDropoutEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate ...@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
...@@ -44,6 +47,7 @@ set(mir_fusers ...@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a chain of n_embedding lookup_table ops whose outputs are summed
// pairwise by elementwise_add ops into a single
// __xpu__embedding_with_eltwise_add op:
//
//   emb0 + emb1 -> ewadd01; ewadd01 + emb2 -> ewadd12; ... -> final out
//
// The first lookup_table node ("embedding0") is kept and converted in place
// into the fused op; all other matched ops/vars are marked intermediate and
// removed by the matcher.
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
 public:
  // n_embedding: number of lookup_table ops in the chain to match.
  explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
      : n_embedding_(n_embedding) {}

  void BuildPattern() override {
    // First two embeddings and their elementwise_add are spelled out
    // explicitly; the remaining n_embedding_ - 2 are generated in the loop.
    auto* ids0 =
        VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table0 =
        VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding0 = OpNode("embedding0", "lookup_table");
    auto* embedding_out0 = VarNode("embedding_out0")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "X")
                               ->AsIntermediate();

    auto* ids1 =
        VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
    auto* table1 =
        VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
    auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
    auto* embedding_out1 = VarNode("embedding_out1")
                               ->assert_is_op_output("lookup_table", "Out")
                               ->assert_is_op_input("elementwise_add", "Y")
                               ->AsIntermediate();

    auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
    auto* ewadd01_out = VarNode("ewadd01_out")
                            ->assert_is_op_output("elementwise_add", "Out")
                            ->AsIntermediate();

    embedding0->LinksFrom({ids0, table0});
    embedding0->LinksTo({embedding_out0});
    embedding1->LinksFrom({ids1, table1});
    embedding1->LinksTo({embedding_out1});
    ewadd01->LinksFrom({embedding_out0, embedding_out1});
    ewadd01->LinksTo({ewadd01_out});

    // Each further embedding i feeds elementwise_add "Y" while the running
    // sum feeds "X"; last_ewadd_out tracks the tail of the chain.
    auto* last_ewadd_out = ewadd01_out;
    for (int i = 2; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      auto embedding_name = paddle::lite::string_format("embedding%d", i);
      auto embedding_out_name =
          paddle::lite::string_format("embedding_out%d", i);

      auto* new_ids = VarNode(ids_name)
                          ->assert_is_op_input("lookup_table", "Ids")
                          ->AsInput();
      auto* new_table = VarNode(table_name)
                            ->assert_is_op_input("lookup_table", "W")
                            ->AsInput();
      auto* new_embedding =
          OpNode(embedding_name, "lookup_table")->AsIntermediate();
      auto* new_embedding_out = VarNode(embedding_out_name)
                                    ->assert_is_op_output("lookup_table", "Out")
                                    ->assert_is_op_input("elementwise_add", "Y")
                                    ->AsIntermediate();

      new_embedding->LinksFrom({new_ids, new_table});
      new_embedding->LinksTo({new_embedding_out});

      auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
      auto ewadd_out_name = ewadd_name + "_out";

      auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
      auto* new_ewadd_out = VarNode(ewadd_out_name)
                                ->assert_is_op_output("elementwise_add", "Out")
                                ->AsIntermediate();

      new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
      new_ewadd->LinksTo({new_ewadd_out});
      last_ewadd_out = new_ewadd_out;
    }
    // The final running sum is the fused op's output.
    last_ewadd_out->AsOutput();
  }

  // Builds the fused op desc and converts the kept "embedding0" statement
  // into the new __xpu__embedding_with_eltwise_add op.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    cpp::OpDesc op_desc;
    op_desc.SetType("__xpu__embedding_with_eltwise_add");
    // Collect all Ids/W inputs of the matched chain, in chain order.
    std::vector<std::string> ids_names;
    std::vector<std::string> table_names;
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      ids_names.push_back(matched.at(ids_name)->arg()->name);
      auto table_name = paddle::lite::string_format("table%d", i);
      table_names.push_back(matched.at(table_name)->arg()->name);
    }
    op_desc.SetInput("Ids", ids_names);
    op_desc.SetInput("Tables", table_names);
    // The output is the last elementwise_add's Out var in the chain.
    auto output_name = paddle::lite::string_format(
        "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
    op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
    op_desc.SetAttr<int>("n_embedding", n_embedding_);
    // padding_idx is taken from the first embedding; presumably all matched
    // lookup_tables share it — TODO confirm no per-op divergence.
    auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
    op_desc.SetAttr<int64_t>(
        "padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));

    // Re-use the embedding0 statement node as the fused op's node: create
    // the op, attach scope/places, and swap op + kernels in place.
    auto* new_stmt = matched.at("embedding0")->stmt();
    auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
    new_op->Attach(op_desc, new_stmt->op()->scope());
    new_op->SetValidPlaces(new_stmt->op()->valid_places());
    auto kernels = new_op->CreateKernels(new_op->valid_places());
    new_stmt->SetOp(new_op);
    new_stmt->SetKernels(std::move(kernels));

    // Wire every Ids/W input and the final output to the re-used node.
    for (int i = 0; i < n_embedding_; ++i) {
      auto ids_name = paddle::lite::string_format("ids%d", i);
      auto table_name = paddle::lite::string_format("table%d", i);
      DirectedLink(matched.at(ids_name), matched.at("embedding0"));
      DirectedLink(matched.at(table_name), matched.at("embedding0"));
    }
    IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
  }

 private:
  // Number of lookup_table ops in the matched chain.
  int n_embedding_;
};
} // namespace fusion
// Program pass driving XPUEmbeddingWithEltwiseAddFuser over the graph.
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // When XTCL is enabled this manual fusion is skipped entirely.
    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
      return;
    }
    // Chains of 4 embeddings are attempted before chains of 3, so the
    // longer pattern is consumed first where both would match.
    for (int chain_length : {4, 3}) {
      fusion::XPUEmbeddingWithEltwiseAddFuser fuser(chain_length);
      fuser(graph.get());
    }
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUFcFuser : public FuseBase {
public:
explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}
  // Matches mul(X, W) -> elementwise_add(+ persistable bias b) [-> relu]
  // so it can be replaced by a single fused __xpu__fc op.
  void BuildPattern() override {
    // create nodes.
    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
    auto* b = VarNode("b")->assert_is_persistable_var();
    auto* mul = OpNode("mul", "mul");
    auto* mul_out = VarNode("mul_out");
    auto* add = OpNode("add", "elementwise_add");
    auto* Out = VarNode("Out");

    // create topology.
    std::vector<PMNode*> mul_inputs{W, x};
    std::vector<PMNode*> add_inputs{mul_out, b};
    mul_inputs >> *mul >> *mul_out;

    // Some op specialities: all matched interior nodes are deleted after
    // fusion; only the external x / W / b / Out vars survive.
    mul_out->AsIntermediate();
    mul->AsIntermediate();
    add->AsIntermediate();

    if (with_relu_) {
      // Extended pattern: ... -> add -> add_out -> relu -> Out.
      auto* add_out = VarNode("add_out");
      auto* relu = OpNode("relu", "relu");
      std::vector<PMNode*> relu_inputs{add_out};
      add_inputs >> *add >> *add_out;
      relu_inputs >> *relu >> *Out;
      add_out->AsIntermediate();
      relu->AsIntermediate();
    } else {
      add_inputs >> *add >> *Out;
    }
  }
  // Quantizes the mul weight to int16 in place (recording its abs-max for
  // dequantization), transposes it, and inserts a fused __xpu__fc node
  // wired to the surviving x / W / b / Out variables.
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
    auto mul = matched.at("mul")->stmt()->op();
    auto* scope = mul->scope();

    // convert W from float to int16, and transpose W
    auto weight_name = matched.at("W")->arg()->name;
    auto* weight_t = scope->FindMutableTensor(weight_name);
    auto weight_dims = weight_t->dims();
    int weight_len = weight_t->numel();
    float* weight_on_host = weight_t->mutable_data<float>();
    // abs-max scale used by ConvertFP32ToInt16 and stored as "w_max" attr.
    float max_f =
        paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
    std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
    std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
    paddle::lite::xpu::math::ConvertFP32ToInt16(
        weight_on_host, weight_int16.get(), max_f, weight_len);
    paddle::lite::xpu::math::Transpose(weight_int16.get(),
                                       weight_trans_int16.get(),
                                       weight_dims[0],
                                       weight_dims[1]);
    // NOTE: the int16 data is written back into the float tensor's buffer
    // (weight_len * sizeof(int16_t) <= original float storage).
    memcpy(
        weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));

    // Build the fused op (transpose_w = true to match the layout above),
    // create its graph node, and relink the surviving variables.
    auto op_desc = GenOpDesc(matched, max_f, true);
    auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
    auto& valid_places = mul->valid_places();
    fc_op->Attach(op_desc, scope);
    auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);

    IR_NODE_LINK_TO(matched.at("W"), new_op_node);
    IR_NODE_LINK_TO(matched.at("x"), new_op_node);
    IR_NODE_LINK_TO(matched.at("b"), new_op_node);
    IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
  }
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
float w_max,
bool transpose_w) {
cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("__xpu__fc");
op_desc.SetInput("Input", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
op_desc.SetAttr("w_max", w_max);
op_desc.SetAttr("transpose_w", transpose_w);
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
bool with_relu_;
};
} // namespace fusion
// Program pass that runs XPUFcFuser over the graph, folding
// mul+elementwise_add(+relu) chains into __xpu__fc ops.
class XPUFcFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // When XTCL is enabled the subgraph engine handles optimization;
    // skip this pass entirely.
    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;

    // Match the longer (with-relu) pattern first so it is not
    // shadowed by the plain mul+add pattern.
    for (bool with_relu : {true, false}) {
      fusion::XPUFcFuser fuser(with_relu);
      fuser(graph.get());
    }
  }
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Register the pass; it only targets XPU and binds to the "fc" kernel.
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
    .BindTargets({TARGET(kXPU)})
    .BindKernel("fc");
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "lite/backends/xpu/math.h" #include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h" #include "lite/operators/subgraph_op.h"
...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { ...@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels)); multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast // remove dangling/useless cast
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr; Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) { for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt()); CHECK(node->IsStmt());
...@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser { ...@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser {
stack = node; stack = node;
} }
} }
Node* stack_out = stack->outlinks.front(); if (stack) {
for (Node* cast : stack_out->outlinks) { std::unordered_set<const Node*> to_remove2;
Node* cast_out = cast->outlinks.front(); Node* stack_out = stack->outlinks.front();
if (cast_out->outlinks.size() == 0) { // avoid modification while traversing
// remove auto stack_out_outlinks = stack_out->outlinks;
to_remove2.insert(cast_out); for (Node* cast : stack_out_outlinks) {
to_remove2.insert(cast); if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
}
} }
GraphSafeRemoveNodes(graph, to_remove2);
} }
GraphSafeRemoveNodes(graph, to_remove2);
} }
}; };
......
...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t = auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>(); scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()), CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1])) static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first " << "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights"; << "dim size of the conv weights";
} else { } else {
...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8 // compute new conv_weight for int8
auto weight_scale = auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale"); conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else { } else {
// compute new conv_weight // compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>(); auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") { if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Runs the scale+activation fuser once per supported activation kind,
// so each scale->act pair in the graph is folded into a single scale op.
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  static const char* kActTypes[] = {"relu", "relu6", "leaky_relu"};
  for (const auto* act_type : kActTypes) {
    fusion::ScaleActivationFuser fuser(act_type);
    fuser(graph.get());
  }
}
} // namespace mir
} // namespace lite
} // namespace paddle
// Register the pass; it only targets ARM and binds to the "scale" kernel.
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
                  paddle::lite::mir::ScaleActivationFusePass)
    .BindTargets({TARGET(kARM)})
    .BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
// MIR pass that fuses a scale op followed by an activation
// (relu / relu6 / leaky_relu) into a single scale op carrying the
// activation as attributes. See scale_activation_fuser.{h,cc}.
class ScaleActivationFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Pattern: x -> scale -> scale_out -> act -> output, where `act` is the
// activation type this fuser was constructed with. The scale op, the
// activation op and the tensor between them are all folded away; only
// the graph input and the final output survive.
void ScaleActivationFuser::BuildPattern() {
  auto* scale_in = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
  auto* scale_op =
      OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
  auto* mid = VarNode("scale_out")
                  ->assert_is_op_output("scale", "Out")
                  ->assert_is_op_input(act_type_, "X")
                  ->AsIntermediate();
  auto* act_op =
      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
  auto* act_out =
      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();

  // Wire the whole chain in one pass.
  *scale_in >> *scale_op >> *mid >> *act_op >> *act_out;
}
// Replaces the matched scale->act chain with one fused scale op: builds
// the fused desc, instantiates and attaches the op, then links the
// original input and output vars to the new instruction node.
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
                                         const key2nodes_t& matched) {
  auto old_scale = matched.at("scale")->stmt()->op();
  auto fused_desc = GenOpDesc(matched);
  auto fused_op = LiteOpRegistry::Global().Create("scale");
  fused_op->Attach(fused_desc, old_scale->scope());

  auto* fused_node =
      graph->GraphCreateInstructNode(fused_op, old_scale->valid_places());
  IR_NODE_LINK_TO(matched.at("x"), fused_node);
  IR_NODE_LINK_TO(fused_node, matched.at("output"));
}
// Builds the fused scale op desc: starts from the original scale desc
// (so its scale/bias attributes survive), redirects Out past the
// removed activation, and records the activation as attributes.
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
  cpp::OpDesc desc = *matched.at("scale")->stmt()->op_info();
  const cpp::OpDesc act_desc = *matched.at("act")->stmt()->op_info();

  desc.SetOutput("Out", {matched.at("output")->arg()->name});
  desc.SetAttr("activation_type", act_type_);
  if (act_type_ == "relu") {
    desc.SetAttr("fuse_relu", true);
  } else if (act_type_ == "relu6") {
    // relu6 stores its clipping bound as "threshold".
    desc.SetAttr("alpha", act_desc.GetAttr<float>("threshold"));
  } else if (act_type_ == "leaky_relu") {
    desc.SetAttr("alpha", act_desc.GetAttr<float>("alpha"));
  }
  return desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// Fuses a scale op with its directly-following activation op
// (relu / relu6 / leaky_relu), folding the activation into the scale
// op's attributes so only one kernel runs at execution time.
class ScaleActivationFuser : public FuseBase {
 public:
  // act_type: the activation op type to match ("relu", "relu6" or
  // "leaky_relu"); it is passed in by ScaleActivationFusePass.
  explicit ScaleActivationFuser(const std::string& act_type) {
    act_type_ = act_type;
  }

  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;

 private:
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // Activation op type this fuser matches and folds.
  std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -25,16 +25,16 @@ namespace lite { ...@@ -25,16 +25,16 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
last_output_shapes.clear();
last_output_lods.clear();
for (size_t i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods[i] = current_outputs->at(i)->lod(); last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -71,12 +71,17 @@ class Optimizer { ...@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", // "lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass", "__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass", "__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
......
...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { ...@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr; return nullptr;
} }
// AttributeVarNames will get persistive attribute names stored in parent scope
std::vector<std::string> Scope::AttributeVarNames() const {
std::vector<std::string> resulted_keys;
const Scope *cur_scope = this;
while (cur_scope->parent()) {
cur_scope = cur_scope->parent();
auto keys = cur_scope->LocalVarNames();
resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
}
// remove feed and fetch
std::vector<std::string> skiped_vars = {"feed", "fetch"};
for (int i = 0; i < skiped_vars.size(); i++) {
auto iter =
std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
while (iter != resulted_keys.end()) {
resulted_keys.erase(iter);
iter =
std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
}
}
return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys; std::vector<std::string> keys;
for (const auto &item : vars_) { for (const auto &item : vars_) {
......
...@@ -45,6 +45,8 @@ class Scope final { ...@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface. // Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k ...@@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k
add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
## 3. extra kernels ## 3. extra kernels
add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
...@@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps ...@@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps
add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册