diff --git a/.gitignore b/.gitignore index 9823f8c945c1be8e717b622a993d402c49517b7c..dc0a38edcb563589ce3845803174598ca68ec396 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,16 @@ test/models/ test/images/ +*.pyc + +# model +*.nb +*.svg +*.dot + +# vim intermediate files +*.swp + # Emacs intermediate files *~ diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a757659bb036ca99326bc40cc075f761ba6e641..f0cbedcba39258327519f45310f24792b4962b91 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -45,7 +45,7 @@ else() # we changed the source code to adapt for windows compiling # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h ###################################################################################################### - URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/docs/demo_guides/cuda.md b/docs/demo_guides/cuda.md index 8b3e76acef590bda19a59388017added6a0b8d52..f863fd86864194c6d022e4cf1fc75eb46725cc2c 100644 --- a/docs/demo_guides/cuda.md +++ b/docs/demo_guides/cuda.md @@ -48,7 +48,7 @@ cuda的编译结果位于 `build_cuda/inference_lite_lib` 4、 `demo` 文件夹:c++ demo. 
-如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 +如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite.so`。 ## 运行 @@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg 二: 运行 -**NOTE:**此处示例使用的是python接口。 +**NOTE:** 此处示例使用的是python接口。 ``` python #-*- coding: utf-8 -*- @@ -75,7 +75,7 @@ import sys import numpy as np import cv2 sys.path.append('build_cuda/inference_lite_lib/python/lib') -from lite_core import * +from lite import * def read_img(im_path, resize_h, resize_w): im = cv2.imread(im_path).astype('float32') diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index e2b15b187bf6dd3b77fe353f23b5d65bf56e44c7..b89a4de37aafdc17c10fe6cb58b7bda272cc69fb 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 0f60b13f35d51d3917425df75d3f157f8b5a87c3..506f2eab721807abcff64e16470edbc6bcd40842 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -1,4 
+1,4 @@ -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_SHUTDOWN_LOG) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed..5c89c24325e2aeff0f8b0ed7a5cd621f26318b8f 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -151,6 +151,11 @@ std::vector Predictor::GetInputNames() { return input_names_; } // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } +// get param names +std::vector Predictor::GetParamNames() { + return exec_scope_->AttributeVarNames(); +} + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { if (!program_) { @@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc, // `inner_places` is used to optimize passes std::vector inner_places = valid_places; for (auto &valid_place : valid_places) { + if (valid_place.target == TARGET(kOpenCL)) continue; inner_places.emplace_back( Place(TARGET(kHost), valid_place.precision, valid_place.layout)); } @@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() { const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; return &var->Get(); } +lite::Tensor *Predictor::GetMutableTensor(const std::string &name) { + auto *var = exec_scope_->FindVar(name); + CHECK(var) << "no variable named with " << name << " in exec_scope"; + return var->GetMutable(); +} + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 146556756af7e0b56ae38b5303e622c97dfe58af..cd542e87ed3bf4632bce141f019e974af6ef4308 100644 --- 
a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -85,6 +85,9 @@ class LITE_API Predictor { // get inputnames and get outputnames. std::vector GetInputNames(); std::vector GetOutputNames(); + // get param names + std::vector GetParamNames(); + void PrepareFeedFetch(); // Get offset-th col of fetch results. @@ -92,6 +95,9 @@ class LITE_API Predictor { std::vector GetOutputs() const; const cpp::ProgramDesc& program_desc() const; + // get a mutable tensor according to its name + lite::Tensor* GetMutableTensor(const std::string& name); + // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; @@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor { // get inputs names and get outputs names std::vector GetInputNames() override; std::vector GetOutputNames() override; + // get param names + std::vector GetParamNames() override; + // get tensor according to tensor's name std::unique_ptr GetTensor( const std::string& name) const override; + // get a mutable tensor according to tensor's name + std::unique_ptr GetMutableTensor( + const std::string& name) override; // Get InputTebsor by name std::unique_ptr GetInputByName( diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 28e87dca394ba06844269746c19a892c26e0c653..18eb0b3545eeb27c6661c48b9a91dbf413757606 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -97,6 +97,10 @@ std::vector CxxPaddleApiImpl::GetInputNames() { return raw_predictor_.GetInputNames(); } +std::vector CxxPaddleApiImpl::GetParamNames() { + return raw_predictor_.GetParamNames(); +} + std::vector CxxPaddleApiImpl::GetOutputNames() { return raw_predictor_.GetOutputNames(); } @@ -123,6 +127,12 @@ std::unique_ptr CxxPaddleApiImpl::GetTensor( return std::unique_ptr(new lite_api::Tensor(x)); } +std::unique_ptr CxxPaddleApiImpl::GetMutableTensor( + const std::string &name) { + return std::unique_ptr( + new 
lite_api::Tensor(raw_predictor_.GetMutableTensor(name))); +} + std::unique_ptr CxxPaddleApiImpl::GetInputByName( const std::string &name) { return std::unique_ptr( diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index 33c0a94cf1a254e42c47aa462c5cfe12e386a87e..8da192701c9d232196c0dbbc9fd374e214821345 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0"); DEFINE_string(input_shape_0, "1,3,224,224", "input shapes another, separated by colon and comma"); - +DEFINE_string(target, "arm", "main target for Predictor: arm, opencl"); DEFINE_bool(use_optimize_nb, false, "optimized & naive buffer model for mobile devices"); @@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - config.set_valid_places({ - Place{TARGET(kARM), PRECISION(kFloat)}, - }); + if (FLAGS_target == "arm") { + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + } else if (FLAGS_target == "opencl") { + config.set_valid_places({ + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel + }); + } auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -78,7 +88,7 @@ void Run(const std::vector>& input_shapes, int tid, const int warmup_times = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -197,7 +207,7 @@ void RunTestType_10(const std::vector>& input_shapes, const int repeat, int 
warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -218,13 +228,13 @@ void RunTestType_11(const std::vector>& input_shapes, const int repeat, int warmup = 5) { lite_api::MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir + ".nb"); config.set_power_mode(power_mode); config.set_threads(thread_num); auto predictor = lite_api::CreatePaddlePredictor(config); - config.set_model_dir(model_dir_0); + config.set_model_from_file(model_dir_0 + ".nb"); auto predictor_0 = lite_api::CreatePaddlePredictor(config); for (int i = 0; i < 2 * repeat; i += 2) { @@ -246,7 +256,8 @@ int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir == "") { LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model"; + << "--model_dir /path/to/your/model --model_dir_0 " + "/path/to/your/model0 --target `arm` or `opencl`"; exit(0); } std::string save_optimized_model_dir = ""; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index a6ad7cff6f234187770eccf1501378c04201b729..a1b963ac4ebf836e29045c8810658e0b30bad2f2 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( optimize_out_type, - "protobuf", + "naive_buffer", "store type of the output optimized model. 
protobuf/naive_buffer"); DEFINE_bool(display_kernels, false, "Display kernel information"); DEFINE_bool(record_tailoring_info, @@ -207,7 +207,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } std::cout << std::setiosflags(std::ios::internal); std::cout << std::setw(maximum_optype_length) << "OP_name"; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { std::cout << std::setw(10) << targets[i].substr(1); } std::cout << std::endl; @@ -215,7 +215,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { std::cout << std::setw(maximum_optype_length) << it->first; auto ops_valid_places = it->second; - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if (std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -235,7 +235,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { } // Print OP info. auto ops_valid_places = supported_ops.at(*op); - for (int i = 0; i < targets.size(); i++) { + for (size_t i = 0; i < targets.size(); i++) { if (std::find(ops_valid_places.begin(), ops_valid_places.end(), targets[i]) != ops_valid_places.end()) { @@ -288,11 +288,11 @@ void ParseInputCommand() { auto valid_places = paddle::lite_api::ParserValidPlaces(); // get valid_targets string std::vector target_types = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { target_types.push_back(valid_places[i].target); } std::string targets_str = TargetToStr(target_types[0]); - for (int i = 1; i < target_types.size(); i++) { + for (size_t i = 1; i < target_types.size(); i++) { targets_str = targets_str + TargetToStr(target_types[i]); } @@ -301,7 +301,7 @@ void ParseInputCommand() { target_types.push_back(TARGET(kUnk)); std::set valid_ops; - for (int i = 0; i < target_types.size(); i++) { + for (size_t i = 0; i < target_types.size(); i++) { auto ops = 
supported_ops_target[static_cast(target_types[i])]; valid_ops.insert(ops.begin(), ops.end()); } @@ -318,7 +318,7 @@ void CheckIfModelSupported() { auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; valid_ops.insert( valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { auto target = valid_places[i].target; auto ops = supported_ops_target[static_cast(target)]; valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); @@ -340,7 +340,7 @@ void CheckIfModelSupported() { std::set unsupported_ops; std::set input_model_ops; - for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { auto current_block = cpp_prog.GetBlock(index); for (size_t i = 0; i < current_block->OpsSize(); ++i) { auto& op_desc = *current_block->GetOp(i); @@ -364,13 +364,13 @@ void CheckIfModelSupported() { unsupported_ops_str = unsupported_ops_str + ", " + *op_str; } std::vector targets = {}; - for (int i = 0; i < valid_places.size(); i++) { + for (size_t i = 0; i < valid_places.size(); i++) { targets.push_back(valid_places[i].target); } std::sort(targets.begin(), targets.end()); targets.erase(unique(targets.begin(), targets.end()), targets.end()); std::string targets_str = TargetToStr(targets[0]); - for (int i = 1; i < targets.size(); i++) { + for (size_t i = 1; i < targets.size(); i++) { targets_str = targets_str + "," + TargetToStr(targets[i]); } diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 14c1ca4a4e9c19d2d3c27b783267682457eeddb2..5af001961af6e4064e45174f1537d0c6f05e6c07 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { "command argument 'valid_targets'"; } -void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { - optimize_out_path_ = optimized_out_path; +void 
OptBase::SetLiteOut(const std::string& lite_out_name) { + lite_out_name_ = lite_out_name; } -void OptBase::RunOptimize(bool record_strip_info) { +void OptBase::RecordModelInfo(bool record_strip_info) { + record_strip_info_ = record_strip_info; +} + +void OptBase::Run() { CheckIfModelSupported(false); OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); opt_config_.set_valid_places(valid_places_); if (model_set_dir_ != "") { - RunOptimizeFromModelSet(record_strip_info); + RunOptimizeFromModelSet(record_strip_info_); } else { auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); opt_predictor->SaveOptimizedModel( - optimize_out_path_, model_type_, record_strip_info); + lite_out_name_, model_type_, record_strip_info_); auto resulted_model_name = - record_strip_info ? "information of striped model" : "optimized model"; + record_strip_info_ ? "information of striped model" : "optimized model"; std::cout << "Save the " << resulted_model_name - << " into :" << optimize_out_path_ << "successfully"; + << " into :" << lite_out_name_ << "successfully"; } } +void OptBase::RunOptimize(const std::string& model_dir_path, + const std::string& model_path, + const std::string& param_path, + const std::string& valid_places, + const std::string& optimized_out_path) { + SetModelDir(model_dir_path); + SetModelFile(model_path); + SetParamFile(param_path); + SetValidPlaces(valid_places); + SetLiteOut(optimized_out_path); + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info_); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + lite_out_name_, model_type_, record_strip_info_); + auto resulted_model_name = + record_strip_info_ ? 
"information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << lite_out_name_ << "successfully"; + } +} // collect ops info of modelset void CollectModelMetaInfo(const std::string& output_dir, const std::vector& models, @@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) { } void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { // 1. mkdir of outputed optimized model set. - lite::MkDirRecur(optimize_out_path_); + lite::MkDirRecur(lite_out_name_); auto model_dirs = lite::ListDir(model_set_dir_, true); if (model_dirs.size() == 0) { LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; @@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { std::string input_model_dir = lite::Join({model_set_dir_, name}, "/"); std::string output_model_dir = - lite::Join({optimize_out_path_, name}, "/"); + lite::Join({lite_out_name_, name}, "/"); if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { auto model_file_path = @@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); opt_predictor->SaveOptimizedModel( - optimize_out_path_, model_type_, record_strip_info); + lite_out_name_, model_type_, record_strip_info); std::cout << "Optimize done. 
"; } @@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { if (record_strip_info) { // Collect all models information CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME); CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); - CollectModelMetaInfo(optimize_out_path_, - model_dirs, - lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); CollectModelMetaInfo( - optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); std::cout << "Record the information of stripped models into :" - << optimize_out_path_ << "successfully"; + << lite_out_name_ << "successfully"; } } void OptBase::PrintHelpInfo() { const std::string opt_version = lite::version(); const char help_info[] = - "At least one argument should be inputed. 
Valid arguments are listed " - "below:\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" + " Valid arguments of Paddle-Lite opt are listed below:\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n" " Arguments of help information:\n" " `help()` Print help infomation\n" - " Arguments of model optimization:\n" + "\n" + " Arguments of model transformation:\n" " `set_model_dir(model_dir)`\n" " `set_model_file(model_file_path)`\n" " `set_param_file(param_file_path)`\n" - " `set_model_type(protobuf|naive_buffer)`\n" - " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " + "default\n" + " `set_lite_out(output_optimize_model_dir)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" - " `run_optimize(false|true)`\n" - " ` ----fasle&true refer to whether to record ops info for " - "tailoring lib, false by default`\n" - " Arguments of model checking and ops information:\n" + " `record_model_info(false|true)`: refer to whether to record ops " + "info for striping lib, false by default`\n" + " `run() : start model transformation`\n" + " eg. `opt.set_model_dir(\"./mobilenetv1\"); " + "opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); " + "opt.run();`\n" + "\n" + " You can also transform model through a single input argument:\n" + " `run_optimize(model_dir, model_file_path, param_file_path, " + "model_type, valid_places, lite_out_name) `\n" + " eg. 
`opt.run_optimize(\"./mobilenetv1\", \"\", \"\", " + "\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`" + "\n" + " Arguments of checking model and printing ops information:\n" " `print_all_ops()` Display all the valid operators of " "Paddle-Lite\n" " `print_supported_ops` Display supported operators of valid " "places\n" " `check_if_model_supported()` Check if the input model is " - "supported\n"; - - std::cout << "opt version:" << opt_version << std::endl - << help_info << std::endl; + "supported\n" + "------------------------------------------------------------------------" + "-----------------------------------------------------------\n"; + std::cout << "opt version:" << opt_version << std::endl << help_info; } // 2. Print supported info of inputed ops void OptBase::PrintOpsInfo(const std::set& valid_ops) { diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h index a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248..3c0051375d0c09d09e0e070df273c94e7a668750 100644 --- a/lite/api/opt_base.h +++ b/lite/api/opt_base.h @@ -44,16 +44,21 @@ class LITE_API OptBase { public: OptBase() = default; void SetModelSetDir(const std::string &model_set_path); - void SetModelDir(const std::string &model_path); + void SetModelDir(const std::string &model_dir_path); void SetModelFile(const std::string &model_path); void SetParamFile(const std::string ¶m_path); void SetValidPlaces(const std::string &valid_places); - void SetOptimizeOut(const std::string &optimized_out_path); + void SetLiteOut(const std::string &lite_out_name); + void RecordModelInfo(bool record_strip_info = true); // set optimized_model type void SetModelType(std::string model_type); // transform and save the optimized model - void RunOptimize(bool record_strip_info = false); - + void Run(); + void RunOptimize(const std::string &model_dir_path = "", + const std::string &model_path = "", + const std::string ¶m_path = "", + const std::string &valid_places = "", + const std::string &optimized_out_path = ""); // fuctions of 
printing info // 1. help info void PrintHelpInfo(); @@ -71,12 +76,12 @@ class LITE_API OptBase { // valid places for the optimized_model std::vector valid_places_; // filename of the optimized_model - std::string optimize_out_path_; + std::string lite_out_name_; // type of the optimized_model, kNaiveBuffer default. LiteModelType model_type_{LiteModelType::kNaiveBuffer}; // Dir path of a set of models, this should be combined with model std::string model_set_dir_; - + bool record_strip_info_{false}; void RunOptimizeFromModelSet(bool record_strip_info = false); }; diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index daef2c66dda5188a1eec25c3d5f045f1fa705e1e..4b13ae4ed241eb1a3164a1213feec12306df89f6 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } +std::unique_ptr PaddlePredictor::GetMutableTensor( + const std::string &name) { + LOG(FATAL) + << "The GetMutableTensor API is only supported by CxxConfig predictor."; + return nullptr; +} + +std::vector PaddlePredictor::GetParamNames() { + std::vector null_result = {}; + LOG(FATAL) + << "The GetParamNames API is only supported by CxxConfig predictor."; + return null_result; +} + void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, LiteModelType model_type, bool record_info) { diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 79ab98da799a99540217d55e3d40b46800f17626..b08f2f5c745f87cda2be181bdea2444b2c11313c 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -86,6 +86,8 @@ class LITE_API PaddlePredictor { virtual std::vector GetInputNames() = 0; // Get output names virtual std::vector GetOutputNames() = 0; + // Get output names + virtual std::vector GetParamNames(); // Get Input by name virtual std::unique_ptr GetInputByName(const std::string& name) = 0; @@ -93,6 +95,9 @@ class LITE_API 
PaddlePredictor { /// Get a readonly tensor, return null if no one called `name` exists. virtual std::unique_ptr GetTensor( const std::string& name) const = 0; + /// Get a mutable tensor, return null if on one called `name` exists + /// internal infereces API, not recommanded. + virtual std::unique_ptr GetMutableTensor(const std::string& name); /// Persist the optimized model to disk. This API is only supported by /// CxxConfig, and the persisted model can be reused for MobileConfig. @@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase { #endif #ifdef LITE_WITH_CUDA void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } - int multi_stream() const { return multi_stream_; } + bool multi_stream() const { return multi_stream_; } #endif #ifdef LITE_WITH_MLU @@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase { // current thread. void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); // XPU only, specify the target device ID for the current thread. 
+ // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread void set_xpu_dev_per_thread(int dev_no = 0); }; diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index 9dc5c9e857243ecb57f785737b00929e36c5d83c..5ce6a9ac9433d720c005d84712ed181d075c61b4 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -19,7 +19,13 @@ #pragma once // some platform-independent defintion -#include "lite/utils/macros.h" + +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif #define USE_LITE_OP(op_type__) \ extern int touch_op_##op_type__(); \ diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 82cd7f3d8da5eb4f00c9069731960a81ef9fe87d..8cb4dbf192993219347d70bb8ccb704199b45f3d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); USE_MIR_PASS(identity_scale_eliminate_pass); +USE_MIR_PASS(identity_dropout_eliminate_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); @@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); +USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); +USE_MIR_PASS(__xpu__fc_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 06d1c607fd761f9f6e58a4c5779e2c3cb9f4e6b3..104275e2e9cf157d7d2f7ca963a1abed2983b92e 100644 --- 
a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) { .def("set_model_file", &OptBase::SetModelFile) .def("set_param_file", &OptBase::SetParamFile) .def("set_valid_places", &OptBase::SetValidPlaces) - .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_lite_out", &OptBase::SetLiteOut) .def("set_model_type", &OptBase::SetModelType) + .def("record_model_info", &OptBase::RecordModelInfo) + .def("run", &OptBase::Run) .def("run_optimize", &OptBase::RunOptimize) .def("help", &OptBase::PrintHelpInfo) .def("print_supported_ops", &OptBase::PrintSupportedOps) diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index b04a6077f5aafecf76fed0b0dee5c56919b9302e..884266a12dc911f6e642518b169370d7aeb83cca 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -50,7 +50,7 @@ if '${WITH_MKL}' == 'ON': # link lite.so to paddlelite.libs if os.name != 'nt': COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ - /inference_lite_lib/python/install/lite/lite.so" +/inference_lite_lib/python/install/lite/lite.so" if os.system(COMMAND) != 0: raise Exception("patch third_party libs failed, command: %s" % COMMAND) diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index d1992f62bbfa9e15ab4d39565f7fe3555e17b215..35d9eeaee1b69bed423cd3b489217c71575b3079 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* 
tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, int tile_h = (hout + 5) / 6; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, @@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, const operators::ConvParam& param, ARMContext* ctx) { auto act_param = param.activation_param; - const int pad_h = (*param.paddings)[0]; - const int 
pad_w = (*param.paddings)[2]; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); @@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, int tile_h = (hout + 1) / 2; int size_tile = tile_h * tile_w; - int w_pad = win + pad_w * 2; - int h_pad = hin + pad_h * 2; + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; const int zero_len = w_pad; float zero_ptr[zero_len]; // NOLINT @@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, i * 4, - -pad_h, - hin + pad_h, - -pad_w, - win + pad_w, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, chin, win, hin, diff --git a/lite/backends/arm/math/lstm.cc b/lite/backends/arm/math/lstm.cc index 5a2a263bb4fa2dc7b4ec54d84c698651a058f933..cd8e012a287437ac9527ca510f927be30d825f0c 100644 --- a/lite/backends/arm/math/lstm.cc +++ b/lite/backends/arm/math/lstm.cc @@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, for (int w = start_w; w < w_adds; ++w) { i_data[w] += b_data[w]; } + i_data += width; } } void vector_dot( @@ -67,15 +68,8 @@ void vector_dot( for (int i = 0; i < remain; ++i) { if (!v2) { out_ptr[i] = in_ptr[i] * v1_ptr[i]; - ++out_ptr; - ++in_ptr; - ++v1_ptr; } else { out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; - ++out_ptr; - ++in_ptr; - ++v1_ptr; - ++v2_ptr; } } } diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index b41afc1c29e121f905b0abc48bae98705bc0ee16..2e869f2df3a292b264dae948f13c64e05854d052 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -72,6 +72,7 @@ void pack_trans_m4(float *out, int mmax, int k0, int kmax); + void sgemm_prepacked_4x4(bool is_transB, int M, int N, @@ -154,6 
+155,20 @@ void sgemm_prepacked_4x8(bool is_transB, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx); +// for kA53 +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float *C, + int ldc, + const float *bias, + bool has_bias, + int is_relu, + ARMContext *ctx); #endif // __aarch64__ /** @@ -300,6 +315,44 @@ void sgemm_prepack(bool is_transB, has_bias, act_param, ctx); + } else if (ctx->arch() == kA53) { + auto act_type = act_param.active_type; + bool has_act = act_param.has_active; + bool act_flag = + (has_act == false) || + (has_act == true && act_type == lite_api::ActivationType::kRelu); + bool has_beta = fabsf(beta) > 1e-8f ? true : false; + bool a53_sgemm = act_flag && !has_beta; + if (a53_sgemm) { + sgemm_prepacked_6x8_a53(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + C, + ldc, + bias, + has_bias, + static_cast(has_act), + ctx); + } else { + sgemm_prepacked_6x8(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + act_param, + ctx); + } } else { sgemm_prepacked_6x8(is_transB, M, @@ -3983,6 +4036,472 @@ void sgemm_prepacked_6x8(bool is_transB, } } +/** + * \brief gemm with ablock = 6, bblock = 8, output 6x8, optimize for a53 arch + * @param A + * @param B + * @param C + * @param M + * @param N + * @param K + * @param threads + * @param workspace + */ +void sgemm_prepacked_6x8_a53(bool is_transB, + int M, + int N, + int K, + const float* A_packed, + const float* B, + int ldb, + float* C, + int ldc, + const float* bias, + bool has_bias, + int is_relu, + ARMContext* ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto* workspace = ctx->workspace_data(); + int threads = ctx->threads(); + //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = + (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); + x_block /= NBLOCK; + x_block *= NBLOCK; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + NBLOCK - 1) / NBLOCK; + x_block *= NBLOCK; + x_block = x_block < NBLOCK ? NBLOCK : x_block; + + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + int tail_pre = (K & (KBLOCK - 1)); + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | is_relu); + bool flag_p_remain = false; + int remain = 0; + + //! apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; + remain = xmax - x0 - (bblocks - 1) * NBLOCK; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + auto b_pannel = static_cast(workspace); + if (is_transB) { + loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); + } else { + loadb(b_pannel, B, ldb, 0, K, x0, xmax); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { + unsigned int ymax = y + MBLOCK_OTH; + if (ymax > M) { + ymax = M; + } + float* c_ptr0 = C + y * ldc + x0; + float* c_ptr1 = c_ptr0 + ldc; + float* c_ptr2 = c_ptr1 + ldc; + float* c_ptr3 = c_ptr2 + ldc; + float* c_ptr4 = c_ptr3 + ldc; + float* c_ptr5 = c_ptr4 + ldc; + + float* pout0 = c_ptr0; + float* pout1 = c_ptr1; + float* pout2 = c_ptr2; + float* pout3 = c_ptr3; + float* pout4 = c_ptr4; + float* pout5 = c_ptr5; + + float bias_local[6] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + bias_local[4] = bias[y + 4]; + bias_local[5] = bias[y + 5]; + } + + float cout0[NBLOCK]; + float cout1[NBLOCK]; + float cout2[NBLOCK]; + float cout3[NBLOCK]; + float 
cout4[NBLOCK]; + float cout5[NBLOCK]; + + const float* a_ptr_l = A_packed + y * K; + const float* b_ptr = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 5) >= ymax) { + switch ((y + 5) - ymax) { + case 4: + c_ptr1 = cout1; + case 3: + c_ptr2 = cout2; + case 2: + c_ptr3 = cout3; + case 1: + c_ptr4 = cout4; + case 0: + c_ptr5 = cout5; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + pout4 = c_ptr4; + pout5 = c_ptr5; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + c_ptr4 = cout4; + c_ptr5 = cout5; + } + const float* a_ptr = a_ptr_l; + int tails = tail_pre; + int k = k_pre; + + // clang-format off + asm volatile( + // sgemm 6x8 for a53 + "vld1.32 {d2-d3}, [%[bias_ptr]] \n" /* load bias0-3 to d2,d3 */ + "vdup.i32 q4, d2[0] \n" /* set out00 to bias0 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64] \n" /* load a00-a30 to d0,d1 */ + "vdup.i32 q5, d2[0] \n" /* set out01 to bias0 */ + "vld1.32 {d4-d5}, [%[b_ptr] :128] \n" /* load b00-b03 to d4,d5 */ + "vdup.i32 q6, d2[1] \n" /* set out10 to bias1 */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vdup.i32 q7, d2[1] \n" /* set out11 to bias1 */ + "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vdup.i32 q8, d3[0] \n" /* set out20 to bias2 */ + "vldr d6, [%[bias_ptr], #0x10] \n" /* load bias 4,5 to d6 */ + "pld [%[a_ptr], #0x40] \n" /* pre load apanel */ + "vdup.i32 q9, d3[0] \n" /* set out21 to bias2 */ + "pld [%[b_ptr], #0x40] \n" /* pre load bpanel */ + "vdup.i32 q10, d3[1] \n" /* set out30 to bias3 */ + "pld [%[a_ptr], #0x80] \n" /* pre load apanel */ + "vdup.i32 q11, d3[1] \n" /* set out31 to bias3 */ + "pld [%[b_ptr], #0x80] \n" /* pre load bpanel */ + "vdup.i32 q12, d6[0] \n" /* set out40 to bias4 */ + "vdup.i32 q13, d6[0] \n" /* set out41 to bias4 */ + "pld [%[a_ptr], #0xC0] \n" /* pre load apanel */ + "vdup.i32 q14, d6[1] \n" /* set out50 to bias5 */ + "pld [%[b_ptr], #0XC0] 
\n" /* pre load bpanel */ + "vdup.i32 q15, d6[1] \n" /* set out51 to bias5 */ + "cmp %[k], #0 \n" /* check k loop */ + "beq 6f \n" /* k==0, branch to 6 */ + "1:\n" + /* Unroll 0 */ + "vldr d6, [%[b_ptr], #0x10] \n" /* load b04, b05 to d6 */ + "vmov d2, r0, r1 \n" /* mov a40, a50 to d2 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "ldr r0, [%[b_ptr], #0x18] \n" /* load b06 to r0 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "ldr r1, [%[b_ptr], #0x1C] \n" /* load b07 to r1 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vldr d3, [%[a_ptr], #0x18] \n" /* load a01, a11 to d3 */ + "vmov d7, r0, r1 \n" /* mov b06, b07 to d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "pld [%[a_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vldr d4, [%[b_ptr], #0x20] \n" /* load b10, b11 to d4 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "ldr r0, [%[b_ptr], #0x28] \n" /* load b12 to r0 */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "ldr r1, [%[b_ptr], #0x2C] \n" /* load b13 to r1 */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vldr d0, [%[a_ptr], #0x20] \n" /* load a21, a31 to d0 */ + "vmov d5, r0, r1 \n" /* mov b12, b13 to d5 */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "ldr r0, [%[a_ptr], #0x28] \n" /* load a41 to r0 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "ldr r1, [%[a_ptr], #0x2C] \n" /* load a51 to r1 */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + /* Unroll 1 */ + "vldr d6, [%[b_ptr], #0x30] \n" /* load b14, b15 to d6 */ + "vmov d1, r0, r1 \n" /* mov a41, a51 to d1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "ldr r0, [%[b_ptr], #0x38] \n" /* load b16 to r0 */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "ldr r1, [%[b_ptr], #0x3C] \n" /* load b17 to r1 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * 
b1l */ + "vldr d2, [%[a_ptr], #0x30] \n" /* load a02, a12 to d0 */ + "vmov d7, r0, r1 \n" /* mov b16, b17 to d7 */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "pld [%[b_ptr], #0x100] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vldr d4, [%[b_ptr], #0x40] \n" /* load b20, b21 to d4 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "ldr r0, [%[b_ptr], #0x48] \n" /* load b22 to r0 */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "ldr r1, [%[b_ptr], #0x4C] \n" /* load b23 to r1 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vldr d3, [%[a_ptr], #0x38] \n" /* load a22, a32 to d3 */ + "vmov d5, r0, r1 \n" /* mov b22, b23 to d5 */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "ldr r0, [%[a_ptr], #0x40] \n" /* load a42 to r0 */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "ldr r1, [%[a_ptr], #0x44] \n" /* load a52 to r1 */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + /* Unroll 2 */ + "vldr d6, [%[b_ptr], #0x50] \n" /* load b24, b25 to d6 */ + "vmov d0, r0, r1 \n" /* mov a42, a52 to d0 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "ldr r0, [%[b_ptr], #0x58] \n" /* load b26 to r0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "ldr r1, [%[b_ptr], #0x5C] \n" /* load b27 to r1 */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vldr d1, [%[a_ptr], #0x48] \n" /* load a03, a13 to d1 */ + "vmov d7, r0, r1 \n" /* mov b26, b27 to d7 */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "pld [%[a_ptr], #0x140] \n" /* pre load apanel */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vldr d4, [%[b_ptr], #0x60] \n" /* load b30, b31 to d4 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "ldr r0, [%[b_ptr], #0x68] \n" /* load b32 to r0 */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h 
*/ + "ldr r1, [%[b_ptr], #0x6C] \n" /* load b33 to r1 */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vldr d2, [%[a_ptr], #0x50] \n" /* load a23, a33 to d2 */ + "vmov d5, r0, r1 \n" /* mov b32, b33 to d5 */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "ldr r0, [%[a_ptr], #0x58] \n" /* load a43 to r0 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "ldr r1, [%[a_ptr], #0x5C] \n" /* load a53 to r1 */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "add %[a_ptr], %[a_ptr], #0x60 \n" /* aptr += 96 */ + /* Unroll 3 */ + "vldr d6, [%[b_ptr], #0x70] \n" /* load b34, b35 to d6 */ + "vmov d3, r0, r1 \n" /* mov a43, a53 to d3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "ldr r0, [%[b_ptr], #0x78] \n" /* load b36 to r0 */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "ldr r1, [%[b_ptr], #0x7C] \n" /* load b37 to r1 */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "add %[b_ptr], %[b_ptr], #0x80 \n" /* bptr += 108 */ + "vldr d0, [%[a_ptr], #0x00] \n" /* load a00, a10 to d0 */ + "vmov d7, r0, r1 \n" /* mov b36, b37 to d7 */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "pld [%[b_ptr], #0xC0] \n" /* pre load bpanel */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vldr d4, [%[b_ptr], #0x00] \n" /* load b00, b01 to d4 */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "ldr r0, [%[b_ptr], #0x08] \n" /* load b02 to r0 */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "ldr r1, [%[b_ptr], #0x0C] \n" /* load b03 to r1 */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "subs %[k], %[k], #1 \n" /* loop k -= 1 */ + "vldr d1, [%[a_ptr], #0x08] \n" /* load a20, a30 to d1 */ + "vmov d5, r0, r1 \n" /* mov b02, b03 to d5 */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "ldr r1, 
[%[a_ptr], #0x14] \n" /* load a50 to r1 */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "bne 1b \n" /* branch to k loop */ + "6:\n" + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "blt 3f \n" /* branch to tail == 1 */ + /* Tail Unroll 0 */ + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "vld1.32 {d3}, [%[a_ptr] :64]! \n" /* load a01, a11 to d3 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b10-b13 to d4,d5 */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* load a21-a51 to d0,d1 */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b14-b17 to d6,d7 */ + "blt 4f \n" /* branch to tail == 2 */ + /* Tail Unroll 1 */ + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b20-b23 to d4,d5 */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "cmp %[tails], #4 \n" /* cmp tail with 4 */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a02-a32 to d2,d3 */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b24-b27 to d6,d7 */ + "blt 5f \n" /* branch to tail == 3 */ + /* Tail Unroll 2 */ + "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ + "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* a42a52a03a13 to d0,d1 */ + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b30-b33 to d4,d5 */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vld1.32 {d2-d3}, [%[a_ptr] :64]! 
\n" /* load a23-a53 to d2,d3 */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b34-b37 to d6,d7 */ + /* Tail Unroll 3 */ + "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ + "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ + "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ + "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ + "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ + "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ + "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ + "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ + "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ + "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ + "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ + "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ + "b 2f \n" /* branch to check relu */ + /* tails==1 final tail */ + "3:\n" + "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ + "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ + "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ + "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ + "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ + "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b04-b07 to d6,d7 */ + "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ + "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ + "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ + "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ + "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ + "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ + "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ + "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ + "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ + "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ + "b 2f \n" /* branch to check relu */ + /* tails==2 final tail */ + "4:\n" + "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ + "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ + "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ + "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ + "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ + "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ + "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ + "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ + "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ + "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ + "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ + "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ + "b 2f \n" /* branch to check relu */ + /* tails==3 final tail */ + "5:\n" + "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ + "vld1.32 {d0}, [%[a_ptr] :64]! 
\n" /* load a42, a52 to d0 */ + "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ + "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ + "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ + "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ + "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ + "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ + "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ + "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ + "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ + "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ + "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ + /* relu */ + "2:\n" + "cmp %[tails], #1 \n" /* cmp tail is relu */ + "bne 0f \n" /* no relu branch to end */ + "vmov.i32 q0, #0 \n" /* mov 0.f to q0 */ + "vmax.f32 q4, q4, q0 \n" /* out00 relu */ + "vmax.f32 q5, q5, q0 \n" /* out01 relu */ + "vmax.f32 q6, q6, q0 \n" /* out10 relu */ + "vmax.f32 q7, q7, q0 \n" /* out11 relu */ + "vmax.f32 q8, q8, q0 \n" /* out20 relu */ + "vmax.f32 q9, q9, q0 \n" /* out21 relu */ + "vmax.f32 q10, q10, q0 \n" /* out30 relu */ + "vmax.f32 q11, q11, q0 \n" /* out31 relu */ + "vmax.f32 q12, q12, q0 \n" /* out40 relu */ + "vmax.f32 q13, q13, q0 \n" /* out41 relu */ + "vmax.f32 q14, q14, q0 \n" /* out50 relu */ + "vmax.f32 q15, q15, q0 \n" /* out51 relu */ + "0:\n" + "vst1.32 {d8-d11}, [%[c_ptr0]]! \n" /* store out0 to cptr0 */ + "vst1.32 {d12-d15}, [%[c_ptr1]]! \n" /* store out1 to cptr1 */ + "vst1.32 {d16-d19}, [%[c_ptr2]]! \n" /* store out2 to cptr2 */ + "vst1.32 {d20-d23}, [%[c_ptr3]]! \n" /* store out3 to cptr3 */ + "vst1.32 {d24-d27}, [%[c_ptr4]]! \n" /* store out4 to cptr4 */ + "vst1.32 {d28-d31}, [%[c_ptr5]]! 
\n" /* store out5 to cptr5 */ + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3), + [c_ptr4] "+r"(c_ptr4), + [c_ptr5] "+r"(c_ptr5), + [k] "+r"(k), + [tails] "+r"(tails) + : [bias_ptr] "r"(bias_local) + : "r0", "r1", "q0","q1","q2","q3","q4", + "q5","q6","q7","q8","q9","q10","q11", + "q12","q13","q14","q15","cc","memory"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + *pout4++ = cout4[i]; + *pout5++ = cout5[i]; + } + } + } + } + } +} + void sgemm_prepacked_4x8(bool is_transB, int M, int N, diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 0955b09d92f64066000b03c4487f359880f1c2a5..fdcbc7394b1be9e438686f91dfa407065d24f91a 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -21,6 +21,17 @@ namespace paddle { namespace lite { namespace arm { namespace math { + +int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + void pooling_basic(const float* din, float* dout, int num, @@ -88,15 +99,27 @@ void pooling_basic(const float* din, #pragma omp parallel for for (int ind_c = 0; ind_c < chin; ++ind_c) { for (int ind_h = 0; ind_h < hout; ++ind_h) { - int sh = ind_h * stride_h; - int eh = sh + kernel_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > hin ? hin : eh - pad_h; + int sh, eh; + if (adaptive) { + sh = AdaptStartIndex(ind_h, hin, hout); + eh = AdaptEndIndex(ind_h, hin, hout); + } else { + sh = ind_h * stride_h; + eh = sh + kernel_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > hin ? 
hin : eh - pad_h; + } for (int ind_w = 0; ind_w < wout; ++ind_w) { - int sw = ind_w * stride_w; - int ew = sw + kernel_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > win ? win : ew - pad_w; + int sw, ew; + if (adaptive) { + sw = AdaptStartIndex(ind_w, win, wout); + ew = AdaptEndIndex(ind_w, win, wout); + } else { + sw = ind_w * stride_w; + ew = sw + kernel_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > win ? win : ew - pad_w; + } float result = static_cast(0); int dst_ind = (ind_n * chout + ind_c) * size_channel_out + ind_h * wout + ind_w; diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 5aad98c05c56f85931b7a0276d0a85b426573c4c..aab1058b9dd66522a0793fc151c54707505d1fbb 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -27,31 +27,467 @@ void scale( int remain = num % 16; float32x4_t vscale = vdupq_n_f32(scale); float32x4_t vbias = vdupq_n_f32(bias); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! 
@ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + dout++; + din++; + } + } +} + +template <> +void scale_relu( + const float* din, float* dout, int num, float scale, float bias) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b\n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b\n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], 
%w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
*dout : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_relu6(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + + "fmin v8.4s, v8.4s, %[valpha].4s \n" + "fmin v9.4s, v9.4s, %[valpha].4s \n" + "fmin v10.4s, v10.4s, %[valpha].4s \n" + "fmin v11.4s, v11.4s, %[valpha].4s \n" + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! @ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! 
@ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vmax.f32 q8, q8, %q[vzero] @ relu \n" + "vmax.f32 q9, q9, %q[vzero] @ relu \n" + "vmax.f32 q10, q10, %q[vzero] @ relu \n" + "vmax.f32 q11, q11, %q[vzero] @ relu \n" + + "vmin.f32 q8, q8, %q[valpha] @ relu \n" + "vmin.f32 q9, q9, %q[valpha] @ relu \n" + "vmin.f32 q10, q10, %q[valpha] @ relu \n" + "vmin.f32 q11, q11, %q[valpha] @ relu \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? (*dout < alpha ? 
*dout : alpha) : 0.f; + dout++; + din++; + } + } +} + +template <> +void scale_leaky_relu(const float* din, + float* dout, + int num, + float scale, + float bias, + float alpha) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vscale = vdupq_n_f32(scale); + float32x4_t vbias = vdupq_n_f32(bias); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t valpha = vdupq_n_f32(alpha); + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v4.4s}, [%[din]], #16 \n" + "and v8.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v5.4s}, [%[din]], #16 \n" + "and v9.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v6.4s}, [%[din]], #16 \n" + "and v10.16b, %[vbias].16b, %[vbias].16b \n" + "ld1 {v7.4s}, [%[din]], #16 \n" + "and v11.16b, %[vbias].16b, %[vbias].16b \n" + + "fmla v8.4s, v4.4s, %[vscale].4s \n" + "fmla v9.4s, v5.4s, %[vscale].4s \n" + "fmla v10.4s, v6.4s, %[vscale].4s \n" + "fmla v11.4s, v7.4s, %[vscale].4s \n" + + "fcmge v12.4s, v8.4s, %[vzero].4s \n" + "fmul v16.4s, v8.4s, %[valpha].4s \n" + + "fcmge v13.4s, v9.4s, %[vzero].4s \n" + "fmul v17.4s, v9.4s, %[valpha].4s \n" + + "fcmge v14.4s, v10.4s, %[vzero].4s \n" + "fmul v18.4s, v10.4s, %[valpha].4s \n" + + "fcmge v15.4s, v11.4s, %[vzero].4s \n" + "fmul v19.4s, v11.4s, %[valpha].4s \n" + + "bif v8.16b, v16.16b, v12.16b \n" /* choose*/ + "bif v9.16b, v17.16b, v13.16b \n" /* choose*/ + "bif v10.16b, v18.16b, v14.16b \n" /* choose*/ + "bif v11.16b, v19.16b, v15.16b \n" /* choose*/ + + "stp q8, q9, [%[dout]], #32 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "stp q10, q11, [%[dout]], #32 \n" + "bne 1b \n" + "0: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "1: @ loop header \n" + "vld1.32 {d8-d11}, [%[din]]! 
@ load din 0 \n" + "vand.32 q8, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q9, %q[vbias], %q[vbias] @ out bias \n" + "vld1.32 {d12-d15}, [%[din]]! @ load din 0 \n" + + "vand.32 q10, %q[vbias], %q[vbias] @ out bias \n" + "vand.32 q11, %q[vbias], %q[vbias] @ out bias \n" + + "vmla.f32 q8, q4, %q[vscale] @ mla \n" + "vmla.f32 q9, q5, %q[vscale] @ mla \n" + "vmla.f32 q10, q6, %q[vscale] @ mla \n" + "vmla.f32 q11, q7, %q[vscale] @ mla \n" + + "vcge.f32 q12, q8, %q[vzero] @ relu \n" + "vmul.f32 q14, q8, %q[valpha] @ mul \n" + "vcge.f32 q13, q9, %q[vzero] @ relu \n" + "vmul.f32 q15, q9, %q[valpha] @ mul \n" + "vbif q8, q14, q12 @ choose \n" + "vbif q9, q15, q13 @ choose \n" + + "vcge.f32 q12, q10, %q[vzero] @ relu \n" + "vmul.f32 q14, q10, %q[valpha] @ mul \n" + "vcge.f32 q13, q11, %q[vzero] @ relu \n" + "vmul.f32 q15, q11, %q[valpha] @ mul \n" + + "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" + + "vbif q10, q14, q12 @ choose \n" + "vbif q11, q15, q13 @ choose \n" + "subs %[cnt], #1 @ loop count minus 1\n" + "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" + + "bne 1b @ jump to main loop start " + "2: \n" + : [dout] "+r"(dout), [din] "+r"(din), [cnt] "+r"(cnt) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + if (remain > 0) { + for (int i = 0; i < remain; i++) { + *dout = *din * scale + bias; + *dout = *dout > 0.f ? 
*dout : (*dout * alpha); + dout++; + din++; + } + } +} + +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_relu(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); #pragma omp parallel for for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - 
float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); } if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); for (int i = 0; i < remain; i++) { *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? 
*dout_ptr : 0; dout_ptr++; din_ptr++; } @@ -59,11 +495,66 @@ void scale( } template <> -void scale(const int* din, int* dout, int num, int scale, int bias) { +void scale_relu6( + const int* din, int* dout, int num, int scale, int bias, int alpha) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vsum1 = vmaxq_s32(vsum1, vzero); + vsum2 = vmaxq_s32(vsum2, vzero); + vsum3 = vmaxq_s32(vsum3, vzero); + vsum4 = vmaxq_s32(vsum4, vzero); + + vsum1 = vminq_s32(vsum1, valpha); + vsum2 = vminq_s32(vsum2, valpha); + vsum3 = vminq_s32(vsum3, valpha); + vsum4 = vminq_s32(vsum4, valpha); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? (*dout_ptr > alpha ? 
alpha : *dout_ptr) : 0; + dout_ptr++; + din_ptr++; + } + } +} + +template <> +void scale_leaky_relu( + const int* din, int* dout, int num, int scale, int bias, int alpha) { int cnt = num >> 4; int remain = num % 16; int32x4_t vscale = vdupq_n_s32(scale); int32x4_t vbias = vdupq_n_s32(bias); + int32x4_t vzero = vdupq_n_s32(0); + int32x4_t valpha = vdupq_n_s32(alpha); #pragma omp parallel for for (int i = 0; i < cnt; i++) { const int* din_ptr = din + (i << 4); @@ -79,16 +570,33 @@ void scale(const int* din, int* dout, int num, int scale, int bias) { int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + uint32x4_t v1 = vcgeq_s32(vsum1, vzero); + uint32x4_t v2 = vcgeq_s32(vsum2, vzero); + uint32x4_t v3 = vcgeq_s32(vsum3, vzero); + uint32x4_t v4 = vcgeq_s32(vsum4, vzero); + + int32x4_t v11 = vmulq_s32(vsum1, valpha); + int32x4_t v21 = vmulq_s32(vsum1, valpha); + int32x4_t v31 = vmulq_s32(vsum1, valpha); + int32x4_t v41 = vmulq_s32(vsum1, valpha); + + vsum1 = vbslq_s32(v1, vsum1, v11); + vsum2 = vbslq_s32(v2, vsum2, v21); + vsum3 = vbslq_s32(v3, vsum3, v31); + vsum4 = vbslq_s32(v4, vsum4, v41); + vst1q_s32(dout_ptr, vsum1); vst1q_s32(dout_ptr + 4, vsum2); vst1q_s32(dout_ptr + 8, vsum3); vst1q_s32(dout_ptr + 12, vsum4); } + if (remain > 0) { const int* din_ptr = din + (cnt << 4); int* dout_ptr = dout + (cnt << 4); for (int i = 0; i < remain; i++) { *dout_ptr = *din_ptr * scale + bias; + *dout_ptr = *dout_ptr > 0 ? 
*dout_ptr : (*dout_ptr) * alpha; dout_ptr++; din_ptr++; } diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index 910bea5613997c05e9257507f8f84792e0071a53..bbdb596bc8f45c247a24f9833680c8a510c1e904 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) { template void scale(const T* din, T* dout, int num, T scale, T bias); +template +void scale_relu(const T* din, T* dout, int num, T scale, T bias); + +template +void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha); + +template +void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha); + template void scale(const T* din, T* dout, diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 41059a0d42a95bbffed4c41611b9f3b8ac60861c..06e6c7ee46d8b839873d433843f0035e3963664c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -28,6 +28,7 @@ namespace lite { class CLContext { public: ~CLContext() { + GetCommandQueue().finish(); for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { // Note(ysh329): Don't need `clReleaseKernel` kernels_[kidx].reset(); diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index ba32d8c803bfd832289a936fe9150ba8d14cd984..17c879269cb745481cd2b474833e71f7417e7bad 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { size_t width = in_image.ImageWidth(); size_t height = in_image.ImageHeight(); auto global_work_size = cl::NDRange{width, height}; - cl::Event event; status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); CL_CHECK_FATAL(status); status = 
context->GetCommandQueue().finish(); CL_CHECK_FATAL(status); +#if 0 double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; +#endif + LOG(INFO) << out_image; } diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc index 7e6f83a4d12f82c780b8e2a8ba582d6a13d8dc07..2cfcc5dc81576973ef20fc0855131472ec2c0977 100644 --- a/lite/backends/opencl/cl_image_converter.cc +++ b/lite/backends/opencl/cl_image_converter.cc @@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, i2 += 4; p++; } else { - image[i2] = 0.0; + image[i2] = Float2Half(0.f); i2 += 4; } } @@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, image[index] = Float2Half(*p); p++; } else { - image[index] = 0.0; + image[index] = Float2Half(0.f); } if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index 582e6a08b16ea7b5b8edd5850b1c9af04db56aad..b427eb70d6cdbb5cd495e970fb77c4790bc01723 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once ///////////////////////////////// @@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in #endif #ifdef RELU6 - output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); + in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in); #endif return output; } diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index 08491d5d9fd195430a4b03673c38767f7e4a5be8..a4070f747aec43f7a0ed097f9b15186cafd32476 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -14,36 +14,30 @@ limitations under the License. */ #include - __kernel void relu(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); in = max((CL_DTYPE4)(0.0f), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } - __kernel void relu6(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale){ - + __private const float scale) { const int x = get_global_id(0); const int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); in = 
max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); @@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } - __kernel void sigmoid(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, @@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out; - out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); - out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); - out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); - out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); + + out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x)))); + out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y)))); + out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z)))); + out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w)))); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } __kernel void leaky_relu(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { const int x = get_global_id(0); const int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; - if (in.x < 0.0f){ + if (in.x < 0.0f) { in.x = s_val.x; } - if (in.y < 0.0f){ + if (in.y < 0.0f) { in.y = s_val.y; } - if (in.z < 0.0f){ + if (in.z < 0.0f) { in.z = s_val.z; } - if (in.w < 0.0f){ + if (in.w < 0.0f) { in.w = s_val.w; } WRITE_IMG_TYPE(CL_DTYPE_CHAR, 
output, (int2)(x, y), in); } __kernel void tanh_act(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); + CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in)); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } __kernel void exp_act(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out = exp(in); @@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, } __kernel void swish(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold, - __private const float scale) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height + __write_only image2d_t output, + __private const 
float threshold, + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } - diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b..1c808da68ddc923e12234bc4b6ac99b35bfffb0b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,28 +1,29 @@ #include -__kernel void conv2d_1x1_opt(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_opt( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c_block, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int 
output_height, - __private const int old_w) { + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c_block, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 8d7950d6b897df833ada56e2de5be7c6203de9ea..771765ea6063a08784ae824a757b28450d808f6d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -27,402 +27,509 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int offset, __private const int input_c, __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, __private const int output_c, __private const int filter_channel, - __private const int filter_width, - __private const int filter_height, - __private const int group) { + __private const int filter_width, + __private const int filter_height, + __private const int group, + __private const int input_tensor_c - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int 
out_nh = get_global_id(2); +) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f); #endif - CL_DTYPE4 input[9]; // 3x3 region of input - if (group == 1) { - for (int i = 0; i < input_c; ++i) { // each run for 3x3 - int2 pos_in = (int2)(i * 
input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - - input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - 
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y += 3; - CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - 
- j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + CL_DTYPE4 input[9]; // 3x3 region of input + if (group == 1) { + for (int i = 0; i < input_c; ++i) { // each run for 3x3 + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= 
input_height) + << 15)); + + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + 
in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + if (i == input_c - 1) { + int c_shr = input_tensor_c % 4; + if (c_shr == 1) { + for (int k = 0; k < 9; k++) { + input[k].y = (half)0.f; + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr == 2) { + for (int k = 0; k < 9; k++) { + input[k].z = (half)0.f; + input[k].w = (half)0.f; + } + } else if (c_shr == 3) { + for (int k = 0; k < 9; k++) { + input[k].w = (half)0.f; + } + } else if (c_shr == 0) { } - } else { // group != 1 - for (int i = 0; i < 4; i++) { - int used_input_channel_num = + } + + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + CL_DTYPE4 weight_x = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_y = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_z = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y += 3; + CL_DTYPE4 
weight_w = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { // group != 1 + for (int i = 0; i < 4; i++) { + int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; 
f_c < filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y); - input[0] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[1] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[2] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (CL_DTYPE4)(0.0f), - 
(ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y - dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y - dilation >= input_height) - << 15)); - input[3] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[4] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y >= input_height) - << 15)); - input[5] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (CL_DTYPE4)(0.0f), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + 
(CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[6] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + input[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x - dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[7] = - select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - input[8] = select( - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || - in_pos_in_one_block.y + dilation < 0 || - in_pos_in_one_block.x + dilation >= input_width || - in_pos_in_one_block.y + dilation >= input_height) - << 15)); - - CL_DTYPE tmp_out = 0; - for 
(int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); - - int f_c_offset = f_c % 4; - CL_DTYPE f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - - int input_c_offset = input_c % 4; - CL_DTYPE input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; + (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + CL_DTYPE tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + CL_DTYPE4 weight = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + + int f_c_offset = f_c % 4; + CL_DTYPE f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; } - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; + int input_c_offset = input_c % 4; + CL_DTYPE input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 
2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; } + tmp_out += f_value * input_value; + } + + if (i == 0) { + output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; } } } + } - output = activation_type4(output); + output = activation_type4(output); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 6ab2b59343f09c1284ec21a7913f67c26707301c..5626fe6be7d451d4ffe22a2008affa7d82298bc3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -12,288 +12,375 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include -__kernel void depth_conv2d_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv2d_3x3( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int dilation, - __private const int input_c, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int batch_index = out_nh / output_height; + const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; + const int 
out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); #else - CL_DTYPE4 output = 0.0f; + CL_DTYPE4 output = 0.0f; #endif - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - CL_DTYPE4 inputs[9]; - - inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = 
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - CL_DTYPE4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= 
input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } - - output = activation_type4(output); - - - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - CL_DTYPE4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf 
\n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); - + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + 
+ inputs[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + 
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } + + output = activation_type4(output); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", 
pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } - - __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int 
ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; #elif defined(BIASE_ELE) - CL_DTYPE4 output[2]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } + CL_DTYPE4 output[2]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } #else - CL_DTYPE4 output[2] = {0.0f}; + CL_DTYPE4 output[2] = {0.0f}; #endif - CL_DTYPE4 inputs[12]; - - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - CL_DTYPE4 filters[9]; - filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); - - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); - - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || 
col_id + 1 >= in_w); - inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); - - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); - - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); - - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); - - - filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - - - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); - inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); - inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); - inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); - - - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); - - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); - - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); - - - filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - 
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); - inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); - inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); - inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); - - - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); - - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); - - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); - - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 
3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + filters[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + filters[6] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = 
READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE( + CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } - diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl index 6c419fe3c134614d28b3bcee3eabac5e8f7bdf6e..4c90981eb97f864b2c7ffa3b01e61b23aa4444de 100644 --- a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -18,7 +18,7 @@ limitations under the License. 
*/ //////////////////////////////////////////////////////// // buffer -> image2d //////////////////////////////////////////////////////// -__kernel void buffer_to_image2d(__global CL_DTYPE *in, +__kernel void buffer_to_image2d(__global CL_DTYPE* in, __write_only image2d_t output_image, __private const int out_H, __private const int out_W, @@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, __private const int Stride0, __private const int Stride1, __private const int Stride2) { - const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, #ifdef DEBUG if (out_w > 2045) { - printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", - out_w, - out_C - 4 * out_c, - (float)(in[input_pos0]), - (float)(in[input_pos1]), - (float)(in[input_pos2]), - (float)(in[input_pos3])); - printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, - output_pos.x, output_pos.y, - (float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); + printf( + "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f " + "%.2f\n", + out_w, + out_C - 4 * out_c, + (float)(in[input_pos0]), + (float)(in[input_pos1]), + (float)(in[input_pos2]), + (float)(in[input_pos3])); + printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", + out_c, + out_w, + out_nh, + output_pos.x, + output_pos.y, + (float)(output.x), + (float)(output.y), + (float)(output.z), + (float)(output.w)); } #endif @@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, const int in_h = in_nh % in_height; const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const int pos_x = mad24(in_c, in_width, in_w); - CL_COMPUTE_DTYPE4 in = 
READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); #ifdef DEBUG if (in_w > 2045) { - printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, - pos_x, in_nh, - (float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); + printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", + in_c, + in_w, + in_nh, + pos_x, + in_nh, + (float)(in.x), + (float)(in.y), + (float)(in.z), + (float)(in.w)); } #endif - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); if (C - 4 * in_c >= 2) { out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); } - if(C - 4 * in_c >= 3) { + if (C - 4 * in_c >= 3) { out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); } - if(C - 4 * in_c >= 4) { + if (C - 4 * in_c >= 4) { out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); } } - -#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile //////////////////////////////////////////////////////// // buffer -> image2d_nw //////////////////////////////////////////////////////// @@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, } #endif - -#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile // image2d -> buffer __kernel void image2d_to_buffer_2d(__private const int in_height, __private const int in_width, @@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, //////////////////////////////////////////////////////// // buffer -> image2d (divide by 255 to normalize) //////////////////////////////////////////////////////// -__kernel void buffer_to_image2d_with_pre255(__global uchar 
*in, +__kernel void buffer_to_image2d_with_pre255(__global uchar* in, __write_only image2d_t output_image, __private const int out_H, __private const int out_W, __private const int out_C, __private const int Stride0, __private const int Stride1, - __private const int Stride2){ - + __private const int Stride2) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, const int in_h = out_h; const int in_w = out_w; - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; @@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; - if(out_C - 4 * out_c>=2){ - output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; } - if(out_C - 4 * out_c>=3){ - output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; } - if(out_C - 4 * out_c>=4){ - output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; } WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); } - //////////////////////////////////////////////////////// // image2d -> buffer (multiply by 255 to de-normalize) //////////////////////////////////////////////////////// __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, - __private const int in_width, - __private const int in_height, - __global uchar* 
out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { const int in_c = get_global_id(0); const int in_w = get_global_id(1); const int in_nh = get_global_id(2); @@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const int pos_x = mad24(in_c, in_width, in_w); - CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; + CL_COMPUTE_DTYPE4 in = + READ_IMG_TYPE( + CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * + 255; #ifdef DEBUG printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", - in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); + in_c, + in_w, + in_nh, + pos_x, + in_nh, + in.x, + in.y, + in.z, + in.w); #endif - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + const int index = + in_n * size_batch + in_c * size_block + in_h * in_width + in_w; out[index] = convert_uchar_sat(in.x); - if(C - 4 * in_c>=2){ + if (C - 4 * in_c >= 2) { out[index + size_ch] = convert_uchar_sat(in.y); } - if(C - 4 * in_c>=3){ + if (C - 4 * in_c >= 3) { out[index + size_ch * 2] = convert_uchar_sat(in.z); } - if(C - 4 * in_c>=4){ + if (C - 4 * in_c >= 4) { out[index + size_ch * 3] = convert_uchar_sat(in.w); } } diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64..c074768a64671076c364f528f62a54bcc104c90e 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -45,6 +45,9 @@ bool CLRuntime::Init() { bool is_device_init = InitializeDevice(); is_init_success_ = 
is_platform_init && is_device_init; initialized_ = true; + + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); return initialized_; } @@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { cl::Context& CLRuntime::context() { if (context_ == nullptr) { - context_ = CreateContext(); + LOG(FATAL) << "context_ create failed. "; } return *context_; } @@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { cl::CommandQueue& CLRuntime::command_queue() { if (command_queue_ == nullptr) { - command_queue_ = CreateCommandQueue(context()); + LOG(FATAL) << "command_queue_ create failed. "; } return *command_queue_; } @@ -96,7 +99,7 @@ std::unique_ptr CLRuntime::CreateEvent( bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ - std::string build_option = options + " -cl-fast-relaxed-math "; + std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable"; VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 9cf07dfc0c474b0b5c57b8355c099eba15610a91..950f2fc442bdbbbb843ea6b15f0c2eac23c2e690 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? 
CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR + : CL_MEM_ALLOC_HOST_PTR), img_format, cl_image2d_width, cl_image2d_height, @@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, size_t size, IoDirection dir) { cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, 0, size, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteBuffer(*static_cast(dst), @@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, cl::array origin = {0, 0, 0}; cl::array region = {cl_image2d_width, cl_image2d_height, 1}; cl_int status; - cl::Event event; auto stream = CLRuntime::Global()->command_queue(); switch (dir) { case IoDirection::DtoD: @@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, origin, region, nullptr, - &event); + nullptr); CL_CHECK_FATAL(status); - event.wait(); + CLRuntime::Global()->command_queue().finish(); break; case IoDirection::HtoD: status = stream.enqueueWriteImage(*static_cast(dst), diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index 05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22..cb1781db2199c1b7a12aaec80b1904f65b23b534 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -129,8 +129,7 @@ struct RowwiseAdd { T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; 
++i) { for (int64_t j = 0; j < size; ++j) { - output_data[i * in_dims[0] + j] = - input_data[i * in_dims[0] + j] + vector_data[j]; + output_data[i * size + j] = input_data[i * size + j] + vector_data[j]; } } } diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index acb377e31ccac96547fc4f0644332cfad36d66bc..fe7a46f9f04d49ea7b505b8e2ece6b4bdd0ec826 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -279,7 +279,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(3) << "no input has value! just return" << std::endl; + VLOG(3) << "no input has value! just return"; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/lite/core/context.cc b/lite/core/context.cc index be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb..711c67f8b7f36edcd2d66569d964296d96e8d85c 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -19,6 +19,7 @@ namespace lite { #ifdef LITE_WITH_XPU thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +int Context::_workspace_l3_size_per_thread{0}; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index bacb570a903d807945cb9e2a8b98615fcaba9384..d0c1bd93cc7b93628aedc5f549c84d19c44f4f71 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -151,14 +151,23 @@ class Context { if (_tls_raw_ctx == nullptr) { _tls_raw_ctx = xdnn::create_context(); CHECK(_tls_raw_ctx); + int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, + _workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", _workspace_l3_size_per_thread = " + << _workspace_l3_size_per_thread; + } } return _tls_raw_ctx; } static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + _workspace_l3_size_per_thread = l3_size; } + // **DEPRECATED**, use xpu_set_device() at the very 
beginning of each worker + // thread static void SetDev(int dev_no = 0) { const char* dev_env = getenv("LITE_XPU_DEV"); if (dev_env) { @@ -173,6 +182,7 @@ class Context { private: static thread_local xdnn::Context* _tls_raw_ctx; + static int _workspace_l3_size_per_thread; }; #endif @@ -340,27 +350,17 @@ class Context { template <> class Context { std::shared_ptr cl_context_; - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; - std::shared_ptr cl_wait_list_; public: CLContext* cl_context() { return cl_context_.get(); } - WaitListType* cl_wait_list() { return cl_wait_list_.get(); } void InitOnce() { // Init cl runtime. CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - cl_context_ = std::make_shared(); - cl_wait_list_ = std::make_shared(); } - void CopySharedTo(OpenCLContext* ctx) { - ctx->cl_context_ = cl_context_; - ctx->cl_wait_list_ = cl_wait_list_; - } + void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; } }; #endif diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index d036bf7988b98e64586e42683d33b4696e9ff706..b8234b18922f454c41e295209da13de024184adc 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,9 +21,13 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc + fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc + fusion/__xpu__fc_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc + elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc diff --git a/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc new file mode 100644 index 
0000000000000000000000000000000000000000..92401df875da1f500ec09b34b2786d15cea2991b --- /dev/null +++ b/lite/core/mir/elimination/identity_dropout_eliminate_pass.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace { + +class Eliminator : public FuseBase { + public: + void BuildPattern() override { + // the previous op's output need updat + auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block"); + // TODO(Superjomn) check has only one output + auto* x = VarNode("x")->assert_is_op_input("dropout", "X"); + auto* dropout_op = OpNode("dropout", "dropout") + ->assert_op_attr("is_test", 1) + ->assert_op_attr( + "dropout_implementation", "upscale_in_train"); + auto* out = VarNode("out")->assert_is_op_output("dropout", "Out"); + auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask"); + + *pre_op >> *x >> *dropout_op >> *out; + *dropout_op >> *mask; + + // The pre_op will be eliminated, and a new output-updated op will insert. 
+ x->AsIntermediate(); // x is pre_op's output, need to update + dropout_op->AsIntermediate(); + mask->AsIntermediate(); + } + + private: + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto& pre_op = matched.at("preop")->AsStmt(); + auto op_info = *pre_op.op_info(); + + op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, + matched.at("out")->AsArg().name); + pre_op.ResetOp(op_info, graph->valid_places()); + + IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); + } +}; + +} // namespace + +class IdentityDropoutEliminatePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + Eliminator eliminator; + eliminator(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(identity_dropout_eliminate_pass, + paddle::lite::mir::IdentityDropoutEliminatePass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index 04a36976c7110c64ef781af12fc86fd4853fe583..a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc DEPS pattern_matcher_high_api) +lite_cc_library(fuse_scale_activation + SRCS scale_activation_fuser.cc + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc @@ -44,6 +47,7 @@ set(mir_fusers fuse_transpose_softmax_transpose fuse_interpolate fuse_sequence_pool_concat + fuse_scale_activation CACHE INTERNAL "fusers") if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1272ae4c63c2521bf738ca8623fcde2d40014dea --- /dev/null +++ 
b/lite/core/mir/fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUEmbeddingWithEltwiseAddFuser : public FuseBase { + public: + explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding) + : n_embedding_(n_embedding) {} + + void BuildPattern() override { + auto* ids0 = + VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table0 = + VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding0 = OpNode("embedding0", "lookup_table"); + auto* embedding_out0 = VarNode("embedding_out0") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* ids1 = + VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput(); + auto* table1 = + VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput(); + auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate(); + auto* embedding_out1 = VarNode("embedding_out1") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + auto* ewadd01 = 
OpNode("ewadd01", "elementwise_add")->AsIntermediate(); + auto* ewadd01_out = VarNode("ewadd01_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + embedding0->LinksFrom({ids0, table0}); + embedding0->LinksTo({embedding_out0}); + embedding1->LinksFrom({ids1, table1}); + embedding1->LinksTo({embedding_out1}); + ewadd01->LinksFrom({embedding_out0, embedding_out1}); + ewadd01->LinksTo({ewadd01_out}); + + auto* last_ewadd_out = ewadd01_out; + for (int i = 2; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + auto embedding_name = paddle::lite::string_format("embedding%d", i); + auto embedding_out_name = + paddle::lite::string_format("embedding_out%d", i); + + auto* new_ids = VarNode(ids_name) + ->assert_is_op_input("lookup_table", "Ids") + ->AsInput(); + auto* new_table = VarNode(table_name) + ->assert_is_op_input("lookup_table", "W") + ->AsInput(); + auto* new_embedding = + OpNode(embedding_name, "lookup_table")->AsIntermediate(); + auto* new_embedding_out = VarNode(embedding_out_name) + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + new_embedding->LinksFrom({new_ids, new_table}); + new_embedding->LinksTo({new_embedding_out}); + + auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i); + auto ewadd_out_name = ewadd_name + "_out"; + + auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate(); + auto* new_ewadd_out = VarNode(ewadd_out_name) + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out}); + new_ewadd->LinksTo({new_ewadd_out}); + last_ewadd_out = new_ewadd_out; + } + last_ewadd_out->AsOutput(); + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__embedding_with_eltwise_add"); + 
std::vector ids_names; + std::vector table_names; + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + ids_names.push_back(matched.at(ids_name)->arg()->name); + auto table_name = paddle::lite::string_format("table%d", i); + table_names.push_back(matched.at(table_name)->arg()->name); + } + op_desc.SetInput("Ids", ids_names); + op_desc.SetInput("Tables", table_names); + auto output_name = paddle::lite::string_format( + "ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1); + op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name}); + op_desc.SetAttr("n_embedding", n_embedding_); + auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info(); + op_desc.SetAttr( + "padding_idx", embedding0_op_info->GetAttr("padding_idx")); + + auto* new_stmt = matched.at("embedding0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + for (int i = 0; i < n_embedding_; ++i) { + auto ids_name = paddle::lite::string_format("ids%d", i); + auto table_name = paddle::lite::string_format("table%d", i); + DirectedLink(matched.at(ids_name), matched.at("embedding0")); + DirectedLink(matched.at(table_name), matched.at("embedding0")); + } + IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name)); + } + + private: + int n_embedding_; +}; + +} // namespace fusion + +class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + for (int n_embedding : {4, 3}) { + fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding); + fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + 
+REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass, + paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("lookup_table"); diff --git a/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e6b28790e1c87f2e9e80acc99f3cf517621c477 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__fc_fuse_pass.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUFcFuser : public FuseBase { + public: + explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {} + + void BuildPattern() override { + // create nodes. + auto* x = VarNode("x")->assert_is_op_input("mul", "X"); + auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); + auto* b = VarNode("b")->assert_is_persistable_var(); + auto* mul = OpNode("mul", "mul"); + auto* mul_out = VarNode("mul_out"); + auto* add = OpNode("add", "elementwise_add"); + auto* Out = VarNode("Out"); + + // create topology. + std::vector mul_inputs{W, x}; + std::vector add_inputs{mul_out, b}; + mul_inputs >> *mul >> *mul_out; + + // Some op specialities. 
+ mul_out->AsIntermediate(); + mul->AsIntermediate(); + add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto mul = matched.at("mul")->stmt()->op(); + auto* scope = mul->scope(); + + // convert W from float to int16, and transpose W + auto weight_name = matched.at("W")->arg()->name; + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy( + weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t)); + + auto op_desc = GenOpDesc(matched, max_f, true); + auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc"); + auto& valid_places = mul->valid_places(); + fc_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); + + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(matched.at("b"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("Out")); + } + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched, + float w_max, + bool transpose_w) { + cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + 
op_desc.mutable_inputs()->clear(); + op_desc.mutable_outputs()->clear(); + op_desc.SetType("__xpu__fc"); + op_desc.SetInput("Input", {matched.at("x")->arg()->name}); + op_desc.SetInput("W", {matched.at("W")->arg()->name}); + op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); + op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); + op_desc.SetAttr( + "in_num_col_dims", + matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + op_desc.SetAttr("w_max", w_max); + op_desc.SetAttr("transpose_w", transpose_w); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } + return op_desc; + } + + bool with_relu_; +}; + +} // namespace fusion + +class XPUFcFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUFcFuser fuser(true /* with_relu */); + fuser(graph.get()); + + fusion::XPUFcFuser fuser2(false /* with_relu */); + fuser2(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("fc"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 655274070f1ffcccf39b5f3ff6aaa705c5cbbfda..a6640f107f5dd46e6570a55cf59d2ad69a2bee1a 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include "lite/backends/xpu/math.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs() #include "lite/core/mir/xpu_pattern_matcher_high_api.h" #include "lite/operators/subgraph_op.h" @@ -588,8 +589,7 @@ class XPUMultiEncoderFuser { multi_encoder_stmt->SetOp(multi_encoder_op); multi_encoder_stmt->SetKernels(std::move(kernels)); - // temp remove useless cast - 
std::unordered_set to_remove2; + // remove dangling/useless cast Node* stack = nullptr; for (auto* node : graph->StmtTopologicalOrder()) { CHECK(node->IsStmt()); @@ -597,16 +597,39 @@ class XPUMultiEncoderFuser { stack = node; } } - Node* stack_out = stack->outlinks.front(); - for (Node* cast : stack_out->outlinks) { - Node* cast_out = cast->outlinks.front(); - if (cast_out->outlinks.size() == 0) { - // remove - to_remove2.insert(cast_out); - to_remove2.insert(cast); + if (stack) { + std::unordered_set to_remove2; + Node* stack_out = stack->outlinks.front(); + // avoid modification while traversing + auto stack_out_outlinks = stack_out->outlinks; + for (Node* cast : stack_out_outlinks) { + if (cast->stmt()->op_info()->Type() != "cast") { + continue; + } + + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // dangling cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]"; + } else if (cast_out->outlinks.size() == 1) { + // useless cast + to_remove2.insert(cast); + to_remove2.insert(cast_out); + VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]"; + + auto* multi_encoder = cast_out->outlinks.front(); + DirectedLink(stack_out, multi_encoder); + UpdateInputs(multi_encoder->stmt()->op().get(), + cast_out->arg()->name, + stack_out->arg()->name); + auto update_op_info = *multi_encoder->stmt()->op_info(); + multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places()); + } } + GraphSafeRemoveNodes(graph, to_remove2); } - GraphSafeRemoveNodes(graph, to_remove2); } }; diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 143a7cecce8c1c45ada9ad31e8e4bea5447fec68..6718356788d46e24752204c3586cd8447cbbfaaa 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { 
std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); + auto groups = conv_op_desc->GetAttr("groups"); + bool depthwise = false; if (conv_type_ == "conv2d_transpose") { + depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups); CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[1])) + static_cast(conv_weight_t->dims()[1] * groups)) << "The BN bias's size should be equal to the size of the first " << "dim size of the conv weights"; } else { @@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr>("weight_scale"); - if (conv_type_ == "conv2d_transpose") { + if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); - if (conv_type_ == "conv2d_transpose") { + if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.cc b/lite/core/mir/fusion/scale_activation_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ad1f4994f6d5183d3b5c925bb222cb95ea064e8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ScaleActivationFusePass::Apply(const std::unique_ptr& graph) { + for (auto act_type : {"relu", "relu6", "leaky_relu"}) { + fusion::ScaleActivationFuser fuser(act_type); + fuser(graph.get()); + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_scale_activation_fuse_pass, + paddle::lite::mir::ScaleActivationFusePass) + .BindTargets({TARGET(kARM)}) + .BindKernel("scale"); diff --git a/lite/core/mir/fusion/scale_activation_fuse_pass.h b/lite/core/mir/fusion/scale_activation_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2118a0b6f396ff12855009a975059c95ee6111a8 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ScaleActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.cc b/lite/core/mir/fusion/scale_activation_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f18099da8bc97d9dab8f9c31fd6c23d42d67d81 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/scale_activation_fuser.h" +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ScaleActivationFuser::BuildPattern() { + // create input nodes. 
+ auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput(); + + // create op nodes + auto* scale = + OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate(); + auto* act = + OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); + + // create intermediate nodes + auto* scale_out = VarNode("scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + + // create output node + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); + // create topology. + *x >> *scale >> *scale_out; + *scale_out >> *act >> *out; +} + +void ScaleActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto scale_op = LiteOpRegistry::Global().Create("scale"); + auto scale = matched.at("scale")->stmt()->op(); + auto* scope = scale->scope(); + auto& valid_places = scale->valid_places(); + scale_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places); + + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) { + cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info(); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info(); + + op_desc.SetAttr("activation_type", act_type_); + if (act_type_ == "relu") { + op_desc.SetAttr("fuse_relu", true); + } else if (act_type_ == "relu6") { + float alpha = act_op_desc.GetAttr("threshold"); + op_desc.SetAttr("alpha", alpha); + } else if (act_type_ == "leaky_relu") { + float alpha = act_op_desc.GetAttr("alpha"); + op_desc.SetAttr("alpha", alpha); + } + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/scale_activation_fuser.h 
b/lite/core/mir/fusion/scale_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa9b9d2b5ebc5091b41a2ca244689797c97ccb6 --- /dev/null +++ b/lite/core/mir/fusion/scale_activation_fuser.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ScaleActivationFuser : public FuseBase { + public: + explicit ScaleActivationFuser(const std::string& act_type) { + act_type_ = act_type; + } + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string act_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 941a9e9f88cf04ef47487237b1a3f6509dea762b..de76f404f8a129eb94e645dc731a0d09c1ee3c77 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -25,16 +25,16 @@ namespace lite { bool OpLite::InferShape() { // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // InferShapeByMemoryInternal will be applied. 
- if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + if (op_param_ && op_param_->input_tensor_ptrs() && + op_param_->output_tensor_ptrs()) { return this->InferShapeWithCache(); } else { - // otherwise, InferShapeImpl is applied directly. return this->InferShapeImpl(); } } bool OpLite::InferShapeWithCache() { // 1. Get vector of current input tensors - auto *current_inputs = param_.input_tensor_ptrs(); + auto *current_inputs = op_param_->input_tensor_ptrs(); // 2. Get hash value of current inputs shape and lod size_t new_hash = 0; for (auto iter = current_inputs->begin(); iter != current_inputs->end(); @@ -59,7 +59,7 @@ bool OpLite::InferShapeWithCache() { if (new_hash == io_shape_lod_hash_ && new_hash != 0) { // if current hash value is consistent with io_shape_lod_hash_, // previous outputs shape and lod are reused. - auto *current_outputs = param_.output_tensor_ptrs(); + auto *current_outputs = op_param_->output_tensor_ptrs(); for (size_t i = 0; i < current_outputs->size(); i++) { current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->set_lod(last_output_lods[i]); @@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { // otherwise, current hash value is changed, InferShapeImpl will apply. 
io_shape_lod_hash_ = new_hash; this->InferShapeImpl(); - auto *current_outputs = param_.output_tensor_ptrs(); + auto *current_outputs = op_param_->output_tensor_ptrs(); + last_output_shapes.clear(); + last_output_lods.clear(); for (size_t i = 0; i < current_outputs->size(); i++) { - last_output_shapes[i] = current_outputs->at(i)->dims(); - last_output_lods[i] = current_outputs->at(i)->lod(); + last_output_shapes.push_back(current_outputs->at(i)->dims()); + last_output_lods.push_back(current_outputs->at(i)->lod()); } } return true; diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 428b188c468ded790e74c9cc4f5da5c7efe2fd00..656f992b1736d88abd1ed95759b19519ec11aff7 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -77,6 +77,11 @@ class OpLite : public Registry { // Link the external execution environ to internal context. bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); + template + inline void AttachParam(T *param) { + op_param_ = static_cast(param); + } + const OpInfo *op_info() const { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); } @@ -167,11 +172,10 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; - std::vector last_output_shapes{}; std::vector>> last_output_lods{}; size_t io_shape_lod_hash_{}; - mutable operators::ParamBase param_; + mutable operators::ParamBase *op_param_{nullptr}; private: // Infer Shape according to memory, if current input shapes are consistent diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7d73155ac067da4bfd112661d9061c008c1ccef1..7c2df12b17bdae80586a94caa8681271cfb7d409 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -111,18 +111,23 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, 
// + KernelRegistryForTarget *, // @@ -141,9 +146,7 @@ class KernelRegistry final { KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 83df76f0230f666ec3857834e234afd921daa927..3d71b5d62e1a2d25202d34461affc78bd27f4852 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -71,12 +71,17 @@ class Optimizer { "identity_scale_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", // "lite_sequence_pool_concat_fuse_pass", // + "lite_scale_activation_fuse_pass", // #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif "__xpu__resnet_fuse_pass", "__xpu__multi_encoder_fuse_pass", + "__xpu__embedding_with_eltwise_add_fuse_pass", + "__xpu__fc_fuse_pass", + "identity_dropout_eliminate_pass", // should be placed after + // xpu fusion "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade..0eebf6a61016a3b399b7a7d4de26a4303f741440 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,6 +22,7 @@ #include #include #include "lite/core/program.h" +#include "lite/fluid/float16.h" #ifdef LITE_WITH_OPENCL #include "lite/backends/opencl/cl_image_converter.h" @@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { return true; } +static bool write_precision_summary_tofile(const std::string& string, + const std::string& log_dir = "") { + if (log_dir == "") { + LOG(INFO) << "The `log_dir` of precision summary file is not set. 
log_dir:" + << log_dir; + return false; + } + FILE* fp = fopen(log_dir.c_str(), "a"); + if (fp == nullptr) { + LOG(INFO) << "Open precision summary file:" << log_dir << "failed."; + return false; + } else { + fprintf(fp, "%s\n", string.c_str()); + } + fclose(fp); + return true; +} + class PrecisionProfiler { public: // TODO(ysh329): need to remove `explicit PrecisionProfiler` @@ -67,7 +86,7 @@ class PrecisionProfiler { using std::left; using std::fixed; STL::stringstream ss; - ss << "========================================= " + ss << "\n\n========================================= " << "Detailed Precision Profiler Summary " << "=========================================" << std::endl; ss << setw(45) << left << "operator:(kernel_info)" @@ -77,6 +96,13 @@ class PrecisionProfiler { << " " << setw(15) << left << "std_deviation" << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + // write to file with path: `log_dir` + if (log_dir_ != "") { + FILE* fp = fopen(log_dir_.c_str(), "a"); + std::string header_str{ss.str()}; + fprintf(fp, "%s\n", header_str.c_str()); + fclose(fp); + } return ss.str(); } @@ -194,6 +220,7 @@ class PrecisionProfiler { } #ifdef LITE_WITH_OPENCL } else if (target_type == TARGET(kOpenCL)) { + CLRuntime::Global()->command_queue().finish(); switch (layout_type) { case DATALAYOUT(kImageDefault): { paddle::lite::CLImageConverterDefault default_convertor; @@ -360,8 +387,12 @@ class PrecisionProfiler { } } } + write_precision_summary_tofile(ss.str(), log_dir_); return ss.str(); } + + private: + std::string log_dir_{"/storage/emulated/0/precision.log"}; }; } // namespace profile diff --git a/lite/core/scope.cc b/lite/core/scope.cc index 775652e2a0d3c962c17dc796ef5f1d381411fa50..d87360a1da8215332c71739bbfa2660977f4f74c 100644 --- a/lite/core/scope.cc +++ b/lite/core/scope.cc @@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const { return nullptr; } +// AttributeVarNames will get persistive attribute names stored in parent 
scope
+std::vector<std::string> Scope::AttributeVarNames() const {
+  std::vector<std::string> resulted_keys;
+  const Scope *cur_scope = this;
+  while (cur_scope->parent()) {
+    cur_scope = cur_scope->parent();
+    auto keys = cur_scope->LocalVarNames();
+    resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
+  }
+  // remove feed and fetch
+  std::vector<std::string> skiped_vars = {"feed", "fetch"};
+  for (int i = 0; i < skiped_vars.size(); i++) {
+    auto iter =
+        std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
+    while (iter != resulted_keys.end()) {
+      resulted_keys.erase(iter);
+      iter =
+          std::find(resulted_keys.begin(), resulted_keys.end(), skiped_vars[i]);
+    }
+  }
+  return resulted_keys;
+}
+
 std::vector<std::string> Scope::LocalVarNames() const {
   std::vector<std::string> keys;
   for (const auto &item : vars_) {
diff --git a/lite/core/scope.h b/lite/core/scope.h
index 2593c365224a0564caa27cf10eee1f917b90c342..aa3a8a1bfb7f4bf1cc00b548c0b0962ce8d73663 100644
--- a/lite/core/scope.h
+++ b/lite/core/scope.h
@@ -45,6 +45,8 @@ class Scope final {
 
   const Scope* parent() const { return parent_; }
 
+  // Get attribute params stored in parent scopes.
+  std::vector<std::string> AttributeVarNames() const;
   // Following the legacy scope interface.
std::vector LocalVarNames() const; diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt index e27548b4e56ce03098c5c82b3eee49add62cc0a4..f057a1f189fdb92ff33f00d5ceacc83f7fc28c5d 100644 --- a/lite/demo/cxx/cuda_demo/CMakeLists.txt +++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt @@ -1,20 +1,24 @@ -project(demo CXX C) cmake_minimum_required(VERSION 2.8) +project(demo CXX C) + +add_definitions(-DLITE_WITH_CUDA) set(TARGET demo) set(CMAKE_CXX_FLAGS "-std=c++11 -O3") -set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") -set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") +set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx") +set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") -include_directories("${LITE_LIB}/include") -link_directories("${LITE_LIB}/lib") -link_directories("${PROTOBUF_LIB}/lib") +include_directories("${LITE_ROOT}/include") +link_directories("${LITE_ROOT}/lib") +link_directories("${PROTOBUF_ROOT}/lib") +# cuda lib +link_directories("/usr/local/cuda/lib64/") add_executable(${TARGET} ${TARGET}.cc) -set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) +set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so) set(DEPS ${DEPS} protobuf-lite) -set(DEPS ${DEPS} "-lrt -lpthread -ldl") +set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart") target_link_libraries(${TARGET} ${DEPS}) diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..fe808ef7ec571bb73b2aa7c4888ba447a35ad8bd --- /dev/null +++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7 @@ -0,0 +1,97 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + +LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o 
test_helper.o -c test_helper.cc + +classification_full.o: classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +classification_full_static: fetch_opencv classification_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +classification_light_shared: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS} + +classification_light_static: fetch_opencv classification_light.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS) + +###### +yolov3_full.o: yolov3_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc + +yolov3_light.o: yolov3_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc + +yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS} + +yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static 
${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+#####
+all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
+
+clean:
+	rm -f *.o
+	rm -f classification_full_shared
+	rm -f classification_full_static
+	rm -f classification_light_shared
+	rm -f classification_light_static
+	rm -f yolov3_full_shared
+	rm -f yolov3_full_static
+	rm -f yolov3_light_shared
+	rm -f yolov3_light_static
diff --git a/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
new file mode 100644
index 0000000000000000000000000000000000000000..f87143a92043e2c011c572bac78a9eb420bacaf1
--- /dev/null
+++ b/lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
@@ -0,0 +1,97 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include + +CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS) + +LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared +LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a +LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared +LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a + +########## +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + +test_helper.o: test_helper.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc + +classification_full.o: classification_full.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc + +classification_light.o: classification_light.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc + +classification_full_shared: fetch_opencv 
classification_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
+
+classification_full_static: fetch_opencv classification_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+classification_light_shared: fetch_opencv classification_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+classification_light_static: fetch_opencv classification_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+######
+yolov3_full.o: yolov3_full.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
+
+yolov3_light.o: yolov3_light.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
+
+yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
+
+yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
+
+yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
+
+yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} 
$(CXX_LIBS) $(LDFLAGS) + +##### +all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static + +clean: + rm -f *.o + rm -f classification_full_shared + rm -r classification_full_static + rm -r classification_light_shared + rm -f classification_light_static + rm -f yolov3_full_shared + rm -f yolov3_full_static + rm -f yolov3_light_shared + rm -f yolov3_light_static diff --git a/lite/demo/cxx/test_libs/classification_full.cc b/lite/demo/cxx/test_libs/classification_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..2515d6abd89b6714ff731bed28f4e8e8c5c3dd75 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_full.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor"); + +// Optimize model for ARM CPU. +// If the model is not combined, set model_filename and params_filename as empty +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& 
img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test.---------- \n\n"; + } else { + std::cout << "----------Pass Test.---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // 
Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/classification_light.cc b/lite/demo/cxx/test_libs/classification_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..91d981e1fc991bef48da97847eddee9e724fe654 --- /dev/null +++ b/lite/demo/cxx/test_libs/classification_light.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_double(out_max_value, 0.0, "The max value in output tensor"); +DEFINE_double(threshold, + 1e-3, + "If the max value diff is smaller than threshold, pass test"); +DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const float out_max_value, + const int out_max_value_index, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << 
"Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + + std::cout << "max_value:" << max_value << std::endl; + std::cout << "max_index:" << max_index << std::endl; + std::cout << "max_value_ground_truth:" << out_max_value << std::endl; + std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl; + if (max_index != out_max_value_index || + fabs(max_value - out_max_value) > threshold) { + std::cerr << "----------Fail Test---------- \n\n"; + } else { + std::cout << "----------Pass Test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_max_value: The max value in output tensor \n" + "--threshold: If the max value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n" + "--out_max_value_index: The max value index in output tensor \n"; + exit(1); + } + + const int height = 224; + const int width = 224; + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + FLAGS_out_max_value, + FLAGS_out_max_value_index, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/prepare.sh b/lite/demo/cxx/test_libs/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c8baf3f1afb7c785b0fb1621910739821b370b0 --- /dev/null +++ b/lite/demo/cxx/test_libs/prepare.sh @@ -0,0 +1,30 @@ +make clean +make all -j + +gf=test_lite_lib_files +if [ -d ${gf} ];then + rm -rf ${gf} +fi +mkdir ${gf} + +mv classification_full_shared ${gf} +mv classification_full_static ${gf} +mv classification_light_shared ${gf} +mv classification_light_static ${gf} +mv yolov3_full_shared ${gf} +mv yolov3_full_static ${gf} +mv yolov3_light_shared ${gf} +mv yolov3_light_static ${gf} +cp run.sh ${gf} + +make clean + +cp -r ../../../cxx/ ${gf} +mv ${gf}/cxx ${gf}/lite + +if [ ! 
-f "test_libs_models_imgs.tgz" ];then + wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz +fi +tar zxvf test_libs_models_imgs.tgz +mv test_libs_models_imgs ${gf} +mv ${gf}/test_libs_models_imgs ${gf}/models_imgs diff --git a/lite/demo/cxx/test_libs/run.sh b/lite/demo/cxx/test_libs/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..ead4c0adfaff1c3b44b9494d45277e365f6ff763 --- /dev/null +++ b/lite/demo/cxx/test_libs/run.sh @@ -0,0 +1,75 @@ +export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH} + +# mobilenetv1 + +./classification_light_shared \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_light_static \ + --optimized_model_path=models_imgs/models/mobilenetv1.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_full_static \ + --model_dir=models_imgs/models/mobilenetv1 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +./classification_full_shared \ + --model_dir=models_imgs/models/mobilenetv1 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.936887 \ + --out_max_value_index=65 + +# mobilenetv2 + +./classification_light_shared \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_light_static \ + --optimized_model_path=models_imgs/models/mobilenetv2.nb \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_full_static \ + --model_dir=models_imgs/models/mobilenetv2 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + 
--out_max_value=0.868888 \ + --out_max_value_index=65 + +./classification_full_shared \ + --model_dir=models_imgs/models/mobilenetv2 \ + --img_txt_path=models_imgs/images/classification.jpg.txt \ + --out_max_value=0.868888 \ + --out_max_value_index=65 + +# yolov3 + +./yolov3_light_shared \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_light_static \ + --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_full_static \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 + +./yolov3_full_shared \ + --model_dir=models_imgs/models/yolov3_mobilenetv1 \ + --img_txt_path=models_imgs/images/yolov3.jpg.txt \ + --out_values=0,0.153605,174.494,199.729,562.075,604.014 diff --git a/lite/demo/cxx/test_libs/test_helper.cc b/lite/demo/cxx/test_libs/test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..450579c90d66f952f32ac70353f4867cee94e007 --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_helper.h" // NOLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + +std::vector GetIntNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + int num = atoi(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +std::vector GetDoubleNumsFromStr(const std::string& str) { + std::vector nums; + std::string tmp_str = str; + while (!tmp_str.empty()) { + double num = atof(tmp_str.data()); + nums.push_back(num); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return nums; +} + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], 
vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) / scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) / scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) / scale[2]; + } +} + +// Process img and set it as input +void process_img(const cv::Mat& img, + int width, + int height, + float* dest_data, + float* means, + float* scales) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, dest_data, width * height, means, scales); +} diff --git a/lite/demo/cxx/test_libs/test_helper.h b/lite/demo/cxx/test_libs/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..3ef42af571925fd556538747cd21b72e925329bc --- /dev/null +++ b/lite/demo/cxx/test_libs/test_helper.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" + +double GetCurrentUS(); + +int64_t ShapeProduction(const std::vector& shape); + +std::vector GetIntNumsFromStr(const std::string& str); +std::vector GetDoubleNumsFromStr(const std::string& str); + +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale); + +void process_img(const cv::Mat& img, + int width, + int height, + float* dst_data, + float* means, + float* scales); diff --git a/lite/demo/cxx/test_libs/yolov3_full.cc b/lite/demo/cxx/test_libs/yolov3_full.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0e69f9042f6ebf8ed68626b52889fac59f73c18 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_full.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(model_dir, + "", + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, + "", + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, + "", + "the filename of param file, set param_file when the model is " + "combined formate."); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void OptModel(const std::string& load_model_dir, + const std::string& model_filename, + const std::string& params_filename, + const std::string& save_model_path) { + paddle::lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + if (!model_filename.empty() && !params_filename.empty()) { + config.set_model_file(load_model_dir + "/" + model_filename); + config.set_param_file(load_model_dir + "/" + params_filename); + } + std::vector vaild_places = { + paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)}, + paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)}, + }; + config.set_valid_places(vaild_places); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::string cmd_str = "rm -rf " + save_model_path; + int ret = system(cmd_str.c_str()); + if (ret == 0) { + std::cout << "Delete old optimized model " << save_model_path << std::endl; + } + predictor->SaveOptimizedModel(save_model_path, + paddle::lite_api::LiteModelType::kNaiveBuffer); + std::cout << "Load model from " << load_model_dir << std::endl; + std::cout << "Save optimized model to " << save_model_path << std::endl; +} + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create 
predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if (is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." 
<< std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--model_dir: the path of not optimized model \n" + "--model_filename: the model filename of not optimized model \n" + "--param_filename: the param filename of not optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + std::string model_dir = FLAGS_model_dir; + if (model_dir.back() == '/') { + model_dir.pop_back(); + } + std::string optimized_model_path = model_dir + "_opt2"; + OptModel(FLAGS_model_dir, + FLAGS_model_filename, + FLAGS_param_filename, + optimized_model_path); + std::string run_model_path = optimized_model_path + ".nb"; + + // Run test + Run(run_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/demo/cxx/test_libs/yolov3_light.cc b/lite/demo/cxx/test_libs/yolov3_light.cc new file mode 100644 index 0000000000000000000000000000000000000000..b31151c8fc2384ec24f2f908d156f4200db279d7 --- /dev/null +++ b/lite/demo/cxx/test_libs/yolov3_light.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "test_helper.h" // NOLINT + +DEFINE_string(optimized_model_path, "", "the path of the optimized model"); +DEFINE_string(img_path, "", "the path of input image"); +DEFINE_string(img_txt_path, + "", + "the path of input image, the image is processed " + " and saved in txt file"); +DEFINE_string(out_values, + "", + "The output values, separated by colon and comma"); +DEFINE_double(threshold, + 1e-3, + "If the output value diff is smaller than threshold, pass test"); + +void Run(const std::string& model_path, + const std::string& img_path, + const std::string& img_txt_path, + const std::vector& out_values, + const float threshold, + const int height, + const int width) { + // set config and create predictor + paddle::lite_api::MobileConfig config; + config.set_threads(3); + config.set_model_from_file(model_path); + + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + // set input + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize({1, 3, height, width}); + auto input_data = input_tensor->mutable_data(); + if (img_txt_path.size() > 0) { + std::fstream fs(img_txt_path); + if (!fs.is_open()) { + std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl; + } + int num = 1 * 3 * height * width; + for (int i = 0; i < num; i++) { + fs >> input_data[i]; + } + } else { + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + if (!img.data) { + std::cerr << "Fail to open img:" << img_path << std::endl; + exit(1); + } + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + process_img(img, width, height, input_data, means, scales); + } + auto shape_tensor = predictor->GetInput(1); + shape_tensor->Resize({1, 2}); + auto* shape_data = shape_tensor->mutable_data(); + shape_data[0] = height; + shape_data[1] = width; + + 
predictor->Run(); + + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + bool is_pass = true; + for (int i = 0; i < output_num && i < out_values.size(); i++) { + std::cout << "id:" << i << " out_data:" << out_data[i] + << " gt_data:" << out_values[i] << std::endl; + if (fabs(out_data[i] - out_values[i]) > threshold) { + is_pass = false; + } + } + if (is_pass) { + std::cout << "----------Pass test---------- \n\n"; + } else { + std::cout << "----------Fail test---------- \n\n"; + } +} + +int main(int argc, char** argv) { + // Check inputs + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_optimized_model_path.empty() || + (FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) { + std::cerr << "Input error." << std::endl; + std::cerr + << "Usage: " << argv[0] << std::endl + << "--optimized_model_path: the path of optimized model \n" + "--img_txt_path: the path of input image, the image is processed \n" + " and saved in txt file \n" + "--img_path: the path of input image \n" + "--out_values: The output values, separated by colon and comma.\n" + "--threshold: If the out value diff is smaller than threshold,\n" + " pass test. 
Default 1e-3.\n"; + exit(1); + } + + const int height = 608; + const int width = 608; + std::vector out_values = GetDoubleNumsFromStr(FLAGS_out_values); + + // Run test + Run(FLAGS_optimized_model_path, + FLAGS_img_path, + FLAGS_img_txt_path, + out_values, + FLAGS_threshold, + height, + width); + return 0; +} diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index aa3a52e8ad1223451de06e820da7e1febb43b879..9670149114d0f7cc953129b83215c0e8b7caa56a 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -56,7 +56,6 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) ## 3. 
extra kernels add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -88,13 +87,10 @@ add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc index e0d4ae3f13c6b8bf2364ab5d50ec45bb245377c6..bbd17d98c6ab3096039a5741dd236467ab577f27 100644 --- a/lite/kernels/arm/beam_search_decode_compute.cc +++ b/lite/kernels/arm/beam_search_decode_compute.cc @@ -114,14 
+114,14 @@ struct BeamSearchDecoder { lod.push_back(source_level_lod); lod.push_back(sentence_level_lod); - *(id_tensor->mutable_lod()) = lod; + id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); auto id_ptr = id_tensor->mutable_data(); TargetCopy( TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t)); - *(score_tensor->mutable_lod()) = lod; + score_tensor->set_lod(lod); score_tensor->Resize({static_cast(score_data.size())}); auto score_ptr = score_tensor->mutable_data(); TargetCopy(TARGET(kARM), diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index fb8529af5a0fa4b92b761e1cd8780859138c2059..2a545e70691f030a3a1e3f2a9a9822f5cd8b85b9 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -72,7 +72,7 @@ void ConvCompute::PrepareForRun() { impl_ = new DepthwiseConv; // VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal && - no_dilation && pads_all_equal) { + no_dilation) { // TODO(MyPandaShaoxiang): winograd conv support any pad impl_ = new WinogradConv; // VLOG(3) << "invoking winograd conv"; @@ -109,6 +109,8 @@ void ConvCompute::PrepareForRun() { int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + int hin = param.x->dims()[2]; + int win = param.x->dims()[3]; bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); @@ -116,13 +118,12 @@ void ConvCompute::PrepareForRun() { bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); bool flag_dw_5x5 = pads_all_equal && (kw == 5 && (sw == 1 || sw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && pads_equal && no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - kps_equal && no_dilation) { + ic * oc < 4 * hin * win 
&& kps_equal && no_dilation) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; } else { @@ -154,6 +155,8 @@ void ConvCompute::PrepareForRun() { int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + int hin = param.x->dims()[2]; + int win = param.x->dims()[3]; bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); @@ -167,7 +170,7 @@ void ConvCompute::PrepareForRun() { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - kps_equal && no_dilation) { + ic * oc < 4 * hin * win && kps_equal && no_dilation) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; } else { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index e433a3f4bb4a7aa553fbb1193ff82779d9af3242..d0880e51de1eff4763c63d2d3fa4bc74cafc859e 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -45,12 +45,14 @@ void WinogradConv::ReInitWhenNeeded() { int ow = o_dims[3]; int tile_block = 8; auto pad = *(param.paddings); - int pad_h = pad[0]; - int pad_w = pad[2]; + int pad_h0 = pad[0]; + int pad_h1 = pad[1]; + int pad_w0 = pad[2]; + int pad_w1 = pad[3]; int oc_pad = (oc + 3) / 4 * 4; int ic_pad = (ic + 3) / 4 * 4; const int new_input_size = - (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + (ic + 3) / 4 * 4 * (ih + pad_h0 + pad_h1) * (iw + pad_w0 + pad_w1); const int temp_size = (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 4 * wino_iw * wino_iw + 8 * wino_iw * wino_iw) * diff --git a/lite/kernels/arm/logical_compute.cc b/lite/kernels/arm/logical_compute.cc deleted file mode 100644 index 1e47329d8ff65f3d036fd4a8a653cfe5cdc80a3a..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/logical_compute.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/logical_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define LOGICAL_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T& a, const T& b) const { return a op b; } \ - }; - -LOGICAL_FUNCTOR(LogicalAnd, &&); -LOGICAL_FUNCTOR(LogicalOr, ||); - -template -struct _LogicalXorFunctor { - inline bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } -}; - -template -struct _LogicalNotFunctor { - inline bool operator()(const T& a) const { return !a; } -}; - -// template -template