diff --git a/CMakeLists.txt b/CMakeLists.txt index 377e58d3ac7c37271d2a813b22912528c556164b..d40491f3eecbea5d4da5817c07be9cb27b8ce25e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) -lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE) +lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index d38c78f62fa2bed4f4483355de0683f1f5b7656b..0d60c578685cd3d3f3adbeac9fc75d1cdcc78c51 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -152,9 +152,10 @@ endif() if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() +endif() + +if (LITE_WITH_PRECISION_PROFILE) + add_definitions("-DLITE_WITH_PRECISION_PROFILE") endif() if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 265de3fbf68542f1b1525257887cbfaa4d1c4d62..780cdea445cf10897ee71c85a939a64406b59c96 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -307,6 +307,9 @@ function(add_kernel TARGET device level) if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() if ("${device}" STREQUAL "Host") @@ -322,16 +325,11 @@ function(add_kernel TARGET device level) set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) + if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL) foreach(src ${args_SRCS}) file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") endforeach() return() - elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() @@ -434,11 +432,13 @@ function(add_operator TARGET level) ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() foreach(src ${args_SRCS}) if(LITE_BUILD_TAILOR) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index a39c0a02681f16578ae81c74d83979fe0c57e6c6..c4dd769b4c9dbda8379aef631b6f44ce3aea9d22 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -108,59 +108,53 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference publish_inference_python_light_demo) endif() +if (LITE_WITH_CUDA OR LITE_WITH_X86) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" 
"${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) +endif() + if (LITE_WITH_X86) add_custom_target(publish_inference_x86_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" ) - add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) - add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) - add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + add_dependencies(publish_inference publish_inference_x86_cxx_demos) endif() if(LITE_WITH_CUDA) - add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) - 
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cuda_cxx_lib) - add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) - add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) -endif(LITE_WITH_CUDA) + add_dependencies(publish_inference publish_inference_cuda_cxx_demos) +endif(LITE_WITH_CUDA) + if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -192,7 +186,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so) endif() endif() else() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index b360b476e0c99a62ea39a70241b548bddf5a872a..5980e5ebe0b922e68a06bf84b51c33271d13e360 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -8,11 +8,12 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() -set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) + +set(light_lib_DEPS light_api paddle_api paddle_api_light) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library - add_library(paddle_full_api_shared SHARED "") - target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) + lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc + DEPS paddle_api paddle_api_light paddle_api_full) add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) @@ -27,13 +28,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE endif(LITE_WITH_CUDA) #light api dynamic library - lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels}) - + lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + ) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) set(LINK_MAP_FILE 
"${PADDLE_SOURCE_DIR}/lite/core/lite.map") set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") @@ -262,7 +263,10 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + BM_DEPS ${bm_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -311,7 +315,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) + DEPS gflags kernel op optimizer mir_passes utils ${host_kernels}) add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index d53de7bf2ed00fed70bbd1f70729a051e5d7203b..0843faf0d6b060a5b76a850de069b1dbf714da19 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -44,7 +44,10 @@ DEFINE_string(input_shape, "set input shapes according to the model, " "separated by colon and comma, " "such as 1,3,244,244"); -DEFINE_string(input_img_path, "", "the path of input image"); +DEFINE_string(input_img_path, + "", + "the path of input image, if not set " + "input_img_path, the input of model will be 1.0."); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -57,16 +60,11 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", - "save benchmark " - "result to the file"); + "save the inference time to the file."); DEFINE_bool(run_model_optimize, false, "if set true, apply model_optimize_tool to " "model and use optimized model to test. "); -DEFINE_bool(is_quantized_model, - false, - "if set true, " - "test the performance of the quantized model. "); namespace paddle { namespace lite_api { @@ -87,10 +85,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; - if (FLAGS_is_quantized_model) { - vaild_places.insert(vaild_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } config.set_valid_places(vaild_places); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -181,8 +175,8 @@ void Run(const std::vector& input_shape, int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "please run ./benchmark_bin --help to obtain usage."; + if (FLAGS_model_dir == "") { + LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; exit(0); } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 556a9e0af01854ff5c57a14dade72b81ed255964..3f3428b434e98ffb0ba578ef7f31a4fbcd9ca619 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -19,6 +19,7 @@ #include #include #include +#include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { @@ -295,6 +296,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc, inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + // Analysis whether the modle is quantized. 
+ // For a quantized model, add place(arm, int8) to the front of inner_places const std::vector<std::string> quant_dequant_op = { "fake_quantize_abs_max", "fake_quantize_range_abs_max", @@ -317,7 +320,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc, } } if (is_quantized_model) { - inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + inner_places.insert(inner_places.begin(), + Place{TARGET(kARM), PRECISION(kInt8)}); } Program program(desc, scope_, inner_places); diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 972210c8f9ea05ba1b041382c43efad64aeacc1b..133b7f7ccf254ca13ab445b0116a684de610896b 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -31,10 +31,17 @@ namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; + auto places = config.valid_places(); #ifdef LITE_WITH_CUDA - Env::Init(); + // If kCUDA is included in the valid places, the CUDA env must be + // initialized first; otherwise this step is skipped. + for (auto &p : places) { + if (p.target == TARGET(kCUDA)) { + Env::Init(); + break; + } + } #endif - auto places = config.valid_places(); std::vector<std::string> passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index b641973a15b2e6abc1cf4c999d759271f7522638..f61e2f35241bb2a361e665f37fb58e3bc5226090 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,13 +13,9 @@ // limitations under the License. #include "lite/api/light_api.h" +#include #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif - -#include namespace paddle { namespace lite { diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc deleted file mode 100644 index cfe3d9de09a646e33c4a116bb3cd087d28aa24c2..0000000000000000000000000000000000000000 --- a/lite/api/light_api_shared.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "lite/api/paddle_api.h" -namespace paddle { -namespace lite_api { -void RunModel() { - // 1. Set MobileConfig - MobileConfig mobile_config; - - // 2. 
Create PaddlePredictor by MobileConfig - std::shared_ptr mobile_predictor = - CreatePaddlePredictor(mobile_config); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/opt.cc b/lite/api/opt.cc index b8497199684cb4f6d4cc602291be5762eb93f7f9..12003050af864da7d88d335553d71007cf5ed9c5 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -23,6 +23,7 @@ #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 1de46a39467af125e705cfcb7a9eeae64a0be133..e48686b913cc5b07f87db0a503ce7081bbe7d058 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -99,7 +99,8 @@ enum class ActivationType : int { kTanh = 6, kSwish = 7, kExp = 8, - NUM = 9, + kAbs = 9, + NUM = 10, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/test_classify_lite_bm.cc b/lite/api/test_classify_lite_bm.cc index 7da7dc03745aa623e35dec5b344e16de03cf5aca..b2507e28adbe050e4715e0c28a433a259607e7a9 100644 --- a/lite/api/test_classify_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -36,7 +36,8 @@ void TestModel(const std::vector& valid_places) { predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); if (FLAGS_input_img_txt_path.empty()) { @@ -67,15 +68,13 @@ void TestModel(const std::vector& valid_places) { << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - auto* out_data = out->data(); + auto out = predictor.GetOutputs(); FILE* fp = fopen("result.txt", "wb"); - for (int i = 0; i < out->numel(); i++) { - fprintf(fp, "%f\n", out_data[i]); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } } fclose(fp); } diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 896b47a97fb20e6935764e12fbe9ebd646a4f816..e1c315f4a63ffd3ed8f51fa4b73ac88b50835cab 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include +#ifdef PADDLE_WITH_TESTING #include +#endif #include #include #include "lite/api/cxx_api.h" diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff..78aa689ff767e8a454dec3aa48a97ecefafdbe7a 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -29,6 +29,7 @@ enum class BinaryOperation { kADD = 0, kMUL = 1, kDIV = 2, + kSUB = 3, }; template @@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x, if (type == BinaryOperation::kADD) return x + y; if (type == BinaryOperation::kMUL) return x * y; if (type == BinaryOperation::kDIV) return x / y; + if (type == BinaryOperation::kSUB) return x - y; } template diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index f0105e060f03df3e4d49c358cf314730cdd16393..153c0620035377afac065e8049a9ebbc0a6f0c15 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lite/backends/opencl/cl_context.h" +#include #include #include #include @@ -35,8 +36,10 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, STL::stringstream program_key_ss; program_key_ss << file_name << options; std::string program_key = program_key_ss.str(); - auto it = programs_.find(program_key); - if (it != programs_.end()) { + + auto &programs = CLRuntime::Global()->programs(); + auto it = programs.find(program_key); + if (it != programs.end()) { VLOG(3) << " --- program -> " << program_key << " has been built --- "; return *(it->second); } @@ -47,14 +50,15 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, CLRuntime::Global()->BuildProgram(program.get(), options); VLOG(3) << " --- end build program -> " << program_key << " --- "; - programs_[program_key] = std::move(program); + programs[program_key] = std::move(program); - return *(programs_[program_key]); + return *(programs[program_key]); } void CLContext::AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options) { + const std::string &options, + const std::string &time_stamp) { cl_int status{CL_SUCCESS}; VLOG(3) << " --- to get program " << file_name << " --- "; auto program = GetProgram(file_name, options); @@ -64,25 +68,30 @@ void CLContext::AddKernel(const std::string &kernel_name, new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; - kernels_.emplace_back(std::move(kernel)); + + auto &kernels = CLRuntime::Global()->kernels(); + auto &kernel_offset_map = CLRuntime::Global()->kernel_offset(); + kernels.emplace_back(std::move(kernel)); STL::stringstream kernel_key; - kernel_key << kernel_name << options; - kernel_offset_[kernel_key.str()] = kernels_.size() - 1; + kernel_key << kernel_name << options << time_stamp; + kernel_offset_map[kernel_key.str()] = kernels.size() - 1; } cl::Kernel &CLContext::GetKernel(const int index) { - VLOG(3) << " --- kernel count: " << kernels_.size() << " --- "; - CHECK(static_cast(index) < kernels_.size()) + auto &kernels = CLRuntime::Global()->kernels(); + VLOG(3) << " --- kernel count: " << kernels.size() << " --- "; + CHECK(static_cast(index) < kernels.size()) << "The index must be less than the size of kernels."; - CHECK(kernels_[index] != nullptr) + CHECK(kernels[index] != nullptr) << "The target kernel pointer cannot be null."; - return 
*(kernels_[index]); + return *(kernels[index]); } cl::Kernel &CLContext::GetKernel(const std::string &name) { - auto it = kernel_offset_.find(name); - CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: " - << name; + auto &kernel_offset_map = CLRuntime::Global()->kernel_offset(); + auto it = kernel_offset_map.find(name); + CHECK(it != kernel_offset_map.end()) << "Cannot find the kernel function: " + << name; return GetKernel(it->second); } @@ -121,14 +130,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 1 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 1 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} + cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size) { int preferred_lws = 0; int divisor = 2; - auto tmp0 = global_work_size[0]; - auto tmp1 = global_work_size[1]; - auto tmp2 = global_work_size[2]; + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; if (divisor > 1) { max_work_size /= divisor; @@ -136,18 +184,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, if (preferred_lws > 0 && preferred_lws <= max_work_size) { max_work_size = preferred_lws; } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? 
gws0 / 2 : 1; } - return cl::NDRange{static_cast<size_t>(tmp0), - static_cast<size_t>(tmp1), - static_cast<size_t>(tmp2)}; + return cl::NDRange{static_cast<size_t>(gws0), + static_cast<size_t>(gws1), + static_cast<size_t>(gws2)}; } } // namespace lite diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 1964c4bf56b55841ba735c79b2f7a17dc1ed451e..b12473ccf5b4238f4ee95b7848a0842ee5b2ffe0 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -36,7 +36,8 @@ class CLContext { void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = ""); + const std::string &options = "", + const std::string &time_stamp = ""); cl::Kernel &GetKernel(const int index); @@ -46,10 +47,11 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - private: - std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_; - std::vector<std::unique_ptr<cl::Kernel>> kernels_; - std::map<std::string, int> kernel_offset_; + cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divisor = 2); + // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, + // size_t max_work_size); }; } // namespace lite diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 63c9954f9181e9252c4d14f57b6ed29107965fe3..8405fc967239e851705feb96f517b3980192ebef 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "lite/backends/opencl/cl_runtime.h" #include +#include #include #include #include "lite/utils/cp_logging.h" @@ -29,10 +30,26 @@ CLRuntime* CLRuntime::Global() { CLRuntime::~CLRuntime() { if (command_queue_ != nullptr) { + command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + + for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { + clReleaseKernel(kernels_[kidx]->get()); + kernels_[kidx].reset(); + } + kernels_.clear(); + kernel_offset_.clear(); + + for (auto& p : programs_) { + clReleaseProgram(p.second->get()); + } + programs_.clear(); + + // For controlling the destruction order + command_queue_&& clReleaseCommandQueue(command_queue_->get()); command_queue_.reset(); + context_&& clReleaseContext(context_->get()); context_.reset(); device_.reset(); platform_.reset(); @@ -73,14 +90,14 @@ cl::CommandQueue& CLRuntime::command_queue() { return *command_queue_; } -std::unique_ptr<cl::Program> CLRuntime::CreateProgram( +std::shared_ptr<cl::Program> CLRuntime::CreateProgram( const cl::Context& context, std::string file_name) { auto cl_file = opencl_kernels_files.find(file_name); std::string content(cl_file->second.begin(), cl_file->second.end()); cl::Program::Sources sources; sources.push_back(content); auto prog = - std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_)); + std::shared_ptr<cl::Program>(new cl::Program(context, sources, &status_)); VLOG(4) << "OpenCL kernel file name: " << file_name; VLOG(4) << "Program source size: " << content.size(); CL_CHECK_FATAL(status_); diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 1a5ededeff37d9f6820af6a49dc22c669620734b..36e5d64b906ff5c91b2b5cb5e97855d7dff511c4 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_utility.h" @@ -42,7 +43,7 @@ class CLRuntime { cl::CommandQueue& command_queue(); - std::unique_ptr CreateProgram(const cl::Context& context, + std::shared_ptr CreateProgram(const cl::Context& context, std::string file_name); std::unique_ptr CreateEvent(const cl::Context& context); @@ -57,6 +58,12 @@ class CLRuntime { std::map& GetDeviceInfo(); + std::unordered_map>& programs() { + return programs_; + } + std::vector>& kernels() { return kernels_; } + std::map& kernel_offset() { return kernel_offset_; } + private: CLRuntime() = default; @@ -98,6 +105,12 @@ class CLRuntime { std::shared_ptr command_queue_{nullptr}; + std::unordered_map> programs_{}; + + std::vector> kernels_{}; + + std::map kernel_offset_{}; + cl_int status_{CL_SUCCESS}; bool initialized_{false}; diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h index b7f14c15e61ba050220ef0819fa9c3d13a7b8606..de01f896a6eb461eb24023a77935bba07de029e7 100644 --- a/lite/backends/opencl/cl_utility.h +++ b/lite/backends/opencl/cl_utility.h @@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } - +#ifndef LITE_SHUTDOWN_LOG #define CL_CHECK_FATAL(err_code__) \ if (err_code__ != CL_SUCCESS) { \ LOG(FATAL) << string_format( \ @@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } +#else +#define CL_CHECK_FATAL(err_code__) +#endif } // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 88fe00d0f2aab41cfd3e5562d29f0a8a82598428..b22b59fbeb6a5e25547e18bcc4f62a263c4f165c 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -181,7 +181,11 @@ class Context { Env::Global(); // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { - cublas_fp32_ = std::make_shared>(); + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; + } } void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { CHECK_GT(devs.size(), 0UL) diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 1ff8b896a70dc538d2486a24db2625c7b62c600a..5727933f477ae76fbfa89c9aa3e03aec8763d445 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -142,7 +142,7 @@ class Env { // Get device count count = API::num_devices(); if (count == 0) { - CHECK(false) << "No device found!"; + LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!"; } else { LOG(INFO) << "Found " << count << " device(s)"; } diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index f5a7837b53650e08f9632b499a4c2ab1faeaeedf..4393832931c95ca20e34ca3b3d2fb4501274b15f 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -26,7 +26,8 @@ namespace mir { void ConvBNFusePass::Apply(const std::unique_ptr& graph) { // initialze fuser params std::vector conv_has_bias_cases{true, false}; - std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + std::vector conv_type_cases{ + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; // start fuse using params for (auto conv_has_bias : conv_has_bias_cases) { for (auto conv_type : conv_type_cases) { diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 
0f5bb64e10dd61c3edf4ddd32569a2d365651cdf..150a6e68d8a924ebfa96fdffb99e28b230689a48 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,10 +103,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); - CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[0])) - << "The BN bias's size should be equal to the size of the first " - << "dim size of the conv weights"; + if (conv_type_ == "conv2d_transpose") { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[1])) + << "The BN scale's size should be equal to the second " + << "dim size of the conv weights"; + } else { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[0])) + << "The BN scale's size should be equal to the first " + << "dim size of the conv weights"; + } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; bool is_weight_quantization = @@ -153,12 +160,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr<std::vector<float>>("weight_scale"); - for (unsigned int i = 0; i < h; ++i) { - weight_scale[i] *= fabsf(alpha_data[i]); - if (alpha_data[i] < 0.f) { - auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { - ptr_row[j] *= -1; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= -1; + } + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + i * w; + for (unsigned int j = 0; j < w; ++j) { + ptr_row[j] *= -1; + } } } } @@ -176,9 +200,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data<float>(); - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels - conv_weight_d[i * w + j] *= alpha_data[i]; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= alpha_data[i]; + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels + for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels + conv_weight_d[i * w + j] *= alpha_data[i]; + } } } } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 
ab81f3d809507dd340056c97a39998c908a75dc7..80a033c75f2e23efa091375ee2a9f78e3ff40d71 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { fuser(graph.get()); } - // delete quant_dequant_node - for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) { - fusion::DeleteQuantDequantOpFuser fuser(op_type); - fuser(graph.get()); - } + // process quant_dequant_node + fusion::DeleteQuantDequantOpFuser dqd_fuser; + dqd_fuser(graph.get()); } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 7797864a2e4b75f52fd7da93ea81613a2175f423..a3a98b871fb4b6f8230299cda978b0f1f8faa779 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, auto* output_scale_node = matched.at("output_scale_node"); auto* output_act_node = matched.at("output_act_node"); - // obtain values, save values and relink node + // obtain scale, save attrs and relink node int bit_length = quant_node->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); auto* scope = quant_node->stmt()->op()->scope(); @@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, ->GetMutable(); float scale_value = scale_tensor->data()[0] / range; + auto in_act_name = input_act_node->arg()->name; + auto out_act_name = output_act_node->arg()->name; auto outlinks = output_act_node->outlinks; for (auto* quantized_node : outlinks) { - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); + // save input scale in quantized op by input argname + index + auto op_desc = *quantized_node->stmt()->mutable_op_info(); + std::string argname; + int index; + op_desc.GetInputArgname(out_act_name, &argname); + op_desc.GetInputIndex(out_act_name, &index); + op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetAttr("bit_length", bit_length); + op_desc.UpdateAllInputs(out_act_name, in_act_name); + quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); IR_NODE_LINK_TO(input_act_node, quantized_node) } @@ -125,19 +136,18 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); int bit_length = quantized_op->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); float max_range = dequant_op->stmt()->op_info()->GetAttr("max_range"); float whole_weight_scale = static_cast(range * range) / max_range / range; - // max_range = range * range / max(abs(weight)) - // weight_scale = range * range / (range * range / max(abs(weight))) / range - // = max(abs(weight)) / range + // As: max_range = range * range / max(abs(weight)) + // So: whole_weight_scale + // = range * range / (range * range / max(abs(weight))) / range + // = max(abs(weight)) / range // set op desc cpp::OpDesc 
op_desc = *quantized_op->stmt()->op_info(); @@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // be Cout. weight_scale_size = quantized_weight_t->dims()[0]; - } else if (quantized_op_type_ == "mul") { + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); // Fc weight: Cin * Cout, the weight_scale_size should be Cout. @@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. @@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() { ->assert_is_op_output(quantized_op_type_) ->assert_is_op_input(dequant_op_type, "X") ->AsIntermediate(); + // The scale var_node of input activation is deleted in DeleteQuantOpFuser auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale") ->assert_is_op_input(dequant_op_type) ->AsIntermediate(); @@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); std::vector weight_scale; std::vector quant_bits = @@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); - op_desc.SetInput("Input", {quantized_op_input->arg()->name}); - op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); - + if (quantized_op_type_ == "conv2d" || + quantized_op_type_ == "depthwise_conv2d") { + op_desc.SetInput("Input", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { + op_desc.SetInput("X", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); + } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. 
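As a quick sanity check on the scale arithmetic documented in the comments above, here is a small standalone sketch; `bit_length` and `max_abs_weight` are hypothetical example values, not taken from any real model:

```cpp
#include <cassert>
#include <cmath>

// Mirror of the fake-quant bookkeeping: the fake_dequant op stores
//   max_range = range * range / max(abs(weight))
// and the fuser recovers
//   whole_weight_scale = range * range / max_range / range
//                      = max(abs(weight)) / range
int main() {
  const int bit_length = 8;                       // hypothetical
  const int range = (1 << (bit_length - 1)) - 1;  // 127 for 8-bit quantization
  const float max_abs_weight = 0.5f;              // hypothetical max(abs(weight))
  const float max_range = static_cast<float>(range) * range / max_abs_weight;
  const float whole_weight_scale =
      static_cast<float>(range * range) / max_range / range;
  // The recovered scale is exactly max(abs(weight)) / range.
  assert(std::fabs(whole_weight_scale - max_abs_weight / range) < 1e-6f);
  return 0;
}
```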
@@ -297,167 +309,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = VarNode("input_act_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = - OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_node = - VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_node = - VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); - output_scale_node->LinksFrom({quant_dequant_node}); - output_act_node->LinksFrom({quant_dequant_node}); - quantized_node->LinksFrom({output_act_node}); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = - VarNode("input_scale_left_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_left_node = - VarNode("input_act_left_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_left_node = - OpNode("quant_dequant_left_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_left_node = - VarNode("output_scale_left_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_left_node = - VarNode("output_act_left_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "X"); - quant_dequant_left_node->LinksFrom( - {input_scale_left_node, input_act_left_node}); - output_scale_left_node->LinksFrom({quant_dequant_left_node}); - output_act_left_node->LinksFrom({quant_dequant_left_node}); - - auto* input_scale_right_node = - VarNode("input_scale_right_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_right_node = - VarNode("input_act_right_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_right_node = - OpNode("quant_dequant_right_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_right_node = - VarNode("output_scale_right_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_right_node = - VarNode("output_act_right_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "Y"); - quant_dequant_right_node->LinksFrom( - {input_scale_right_node, input_act_right_node}); - output_scale_right_node->LinksFrom({quant_dequant_right_node}); - output_act_right_node->LinksFrom({quant_dequant_right_node}); - - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - quantized_node->LinksFrom({output_act_left_node, output_act_right_node}); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; - } - VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:" - << quantized_op_type_; + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type, "InScale"); + auto* 
input_act_node = + VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); + auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) + ->assert_is_op(quant_dequant_op_type); + auto* output_scale_node = + VarNode("output_scale_node") + ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + auto* output_act_node = + VarNode("output_act_node") + ->assert_is_op_output(quant_dequant_op_type, "Out"); + + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + output_scale_node->LinksFrom({quant_dequant_node}); + output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = matched.at("input_scale_node"); - auto* input_act_node = matched.at("input_act_node"); - auto* quant_dequant_node = matched.at("quant_dequant_node"); - auto* output_scale_node = matched.at("output_scale_node"); - auto* output_act_node = matched.at("output_act_node"); - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_node->stmt()->op()->scope(); - auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) - ->GetMutable(); - float scale_value = scale_tensor->data()[0] / range; - - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); - op_desc->SetInput("X", {input_act_node->arg()->name}); - IR_NODE_LINK_TO(input_act_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_node, - quant_dequant_node, - output_scale_node, - output_act_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = matched.at("input_scale_left_node"); - auto* input_act_left_node = matched.at("input_act_left_node"); - auto* quant_dequant_left_node = matched.at("quant_dequant_left_node"); - auto* output_scale_left_node = matched.at("output_scale_left_node"); - auto* output_act_left_node = matched.at("output_act_left_node"); - - auto* input_scale_right_node = matched.at("input_scale_right_node"); - auto* input_act_right_node = matched.at("input_act_right_node"); - auto* quant_dequant_right_node = matched.at("quant_dequant_right_node"); - auto* output_scale_right_node = matched.at("output_scale_right_node"); - auto* output_act_right_node = matched.at("output_act_right_node"); - - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_left_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_left_node->stmt()->op()->scope(); - auto* left_scale_tensor = - scope->FindVar(output_scale_left_node->arg()->name) - ->GetMutable(); - float left_scale_value = left_scale_tensor->data()[0] / range; - auto* right_scale_tensor = - scope->FindVar(output_scale_right_node->arg()->name) - ->GetMutable(); - float right_scale_value = right_scale_tensor->data()[0] / range; - - auto* op_desc = 
quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("x_input_scale", left_scale_value); - op_desc->SetAttr("y_input_scale", right_scale_value); - op_desc->SetInput("X", {input_act_left_node->arg()->name}); - op_desc->SetInput("Y", {input_act_right_node->arg()->name}); - IR_NODE_LINK_TO(input_act_left_node, quantized_node) - IR_NODE_LINK_TO(input_act_right_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set<const Node*> nodes2rm = {input_scale_left_node, - quant_dequant_left_node, - output_scale_left_node, - output_act_left_node, - input_scale_right_node, - quant_dequant_right_node, - output_scale_right_node, - output_act_right_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; + auto* input_scale_node = matched.at("input_scale_node"); + auto* input_act_node = matched.at("input_act_node"); + auto* quant_dequant_node = matched.at("quant_dequant_node"); + auto* output_scale_node = matched.at("output_scale_node"); + auto* output_act_node = matched.at("output_act_node"); + auto input_act_name = input_act_node->arg()->name; + auto output_act_name = output_act_node->arg()->name; + + // Get scale value from scale var node + int bit_length = + quant_dequant_node->stmt()->op_info()->GetAttr<int>("bit_length"); + int range = ((1 << (bit_length - 1)) - 1); + auto* scope = quant_dequant_node->stmt()->op()->scope(); + auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) + ->GetMutable<lite::Tensor>(); + float scale_value = scale_tensor->data<float>()[0] / range; + + auto quantized_nodes = output_act_node->outlinks; + for (auto* quantized_node : quantized_nodes) { + // Save quantization info in op_info attr + auto op_info = *quantized_node->stmt()->op_info(); + std::string argname; + int index; + op_info.GetInputArgname(output_act_name, &argname); + op_info.GetInputIndex(output_act_name, &index); + op_info.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_info.SetAttr("input_scale", scale_value); // Save it for now + op_info.SetAttr("bit_length", bit_length); + + op_info.UpdateAllInputs(output_act_name, input_act_name); + quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); + IR_NODE_LINK_TO(input_act_node, quantized_node); } + // delete nodes and edges + std::unordered_set<const Node*> nodes2rm = { + input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + GraphSafeRemoveNodes(graph, nodes2rm); } cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index bef9f4d9573d049700736c166cd0d31b668f7eff..ac3ac112b3aa504bc075125f2f13292073ca9444 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase { }; /* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * pooled/elementwise_add" can be deteted by this fuser. The fuser - * extract the input_scale form fake_quant_dequant_op and save into - * the quantized_op. Besides, the fuser delete fake_quant_dequant_op in - * the graph. + * quantized_op" can be detected by this fuser. The fuser modifies the input + * scale for the quantized_op and deletes the fake_quant_dequant_op. 
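+ * For example, if the deleted op's output fed the "Input" argument of a + * conv2d at index 0, the scale is saved on that conv2d both as the legacy + * "input_scale" attribute and as "Input0_input_scale".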
*/ - class DeleteQuantDequantOpFuser : public FuseBase { public: - explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type) - : quantized_op_type_(quantized_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - - private: - std::string quantized_op_type_{}; }; } // namespace fusion diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 4c6c66be7e41889c116aed023d863df8a4a912c8..1cdc33825cb4ffb758b46ac4b9bee968b3fca055 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc { return false; } + // For a given input variable name, find the index of that variable + // within its input argname's variable list + bool GetInputIndex(const std::string &value_name, int *out) const { + for (auto &item : inputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + + // For a given output variable name, find the index of that variable + // within its output argname's variable list + bool GetOutputIndex(const std::string &value_name, int *out) const { + for (auto &item : outputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + void UpdateAllInputs(const std::string &from, const std::string &to) { for (auto &item : inputs_) { for (auto &var : item.second) { diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index 39213a33cebd05d9cfa50d82cdfb09ad3f7ad637..e72d1f54ee858ef10de83ceefb49addae6ea6606 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -18,6 +18,7 @@ * of each kernel. 
*/ #pragma once +#include #include #include #include "lite/core/program.h" diff --git a/lite/core/program.cc b/lite/core/program.cc index 7284c3983cb34a0db2387ece40f6d07b9d9a8511..ff900c0e23be9a06313babba51e3ce364295231a 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -20,7 +20,7 @@ #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE #include "lite/core/profile/precision_profiler.h" #endif @@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); std::string precision_profiler_summary = inst_precision_profiler.GetSummaryHeader(); -#endif #endif for (auto& inst : instructions_) { @@ -149,21 +147,19 @@ void RuntimeProgram::Run() { if (inst.is_feed_fetch_op()) continue; #endif inst.Run(); -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA precision_profiler_summary += inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#endif #ifdef LITE_WITH_PRECISION_PROFILE LOG(INFO) << "\n" << precision_profiler_summary; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE +#endif } void Program::Build(const cpp::ProgramDesc& prog) { diff --git a/lite/demo/cxx/train_demo/README.md b/lite/demo/cxx/train_demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..56f4513d45676a1deb51bfb93096db156ddd0449 --- /dev/null +++ b/lite/demo/cxx/train_demo/README.md @@ -0,0 +1,191 @@ + +# Introduction + PaddleLite is best known for on-device inference, but it also supports training on mobile devices. This document gives an example of training with PaddleLite; the task is Boston housing price prediction, also known as "fit-a-line". + + You can learn more about how this task is defined and modeled through the +[document](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) +and +[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) +in the book repository; +it is modeled with linear regression (Linear Regression). +This document focuses on how to migrate that model to Paddle-Lite for training. + +Note: this is a tutorial on training with the C++ API; the other APIs do not support training yet. + +# Requirements + +- An Android phone, to run the training program +- Python with Paddle (version: 1.7.0) installed + +# Quick start + +## Step1 build paddle-lite + +Please follow the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish paddle-lite lib. Taking a Linux build as an example, the commands are: + +```shell +## set up the environment +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate +tar xzf cmake-3.10.3-Linux-x86_64.tar.gz +export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH + +wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip +unzip android-ndk-r17c-linux-x86_64.zip +export NDK_ROOT=/opt/android-ndk-r17c + +## build +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv7 \ + --build_extra=ON \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_train=ON full_publish +``` + +Artifacts: + +```shell +Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so +``` + +## Step2 build lr_trainer + +```shell +cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ +sh run_build.sh 
+  /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ \
+  /path/to/your/android-ndk-r17c
+```
+
+Artifacts:
+
+```shell
+bin/
+`-- demo_trainer
+```
+
+## Step3 download model and run it!
+
+On your laptop, connect to the phone over USB with developer mode enabled, then run the following in any directory:
+
+```shell
+local_path=/data/local/tmp/linear_regression
+adb shell "mkdir "${local_path}
+
+# download model and push to mobile
+wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
+tar -zxvf lite_lr_model.tar.gz
+adb push lite_lr_model/housing.data ${local_path}
+adb push lite_lr_model/model_dir ${local_path}
+
+# push lib and executable file to mobile
+adb push libpaddle_full_api_shared.so ${local_path}
+adb push demo_trainer ${local_path}
+adb shell chmod +x ${local_path}/demo_trainer
+
+# run it!
+adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true"
+```
+
+Expected output:
+
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+# More details
+
+The model above is downloaded ready-made; if you want to generate it yourself, run:
+
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite/lite/demo/cxx/train_demo/
+python train.py --save_model
+```
+
+Artifacts:
+
+```shell
+model_dir/
+|-- fc_0.b_0
+|-- fc_0.w_0
+|-- learning_rate_0
+`-- __model__
+
+md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
+```
+
+If you want to generate your own model for training, see how `train.py` saves its model.
+
+# Cross-checking against Paddle training results
+
+## The first 10 loss values
+
+To verify that Paddle and Lite behave consistently, we train for 10 batches with identical model parameters, identical data, and batch size = 1, and record the loss values of both.
+
+Python + Paddle command:
+
+```shell
+python train.py --num_steps=10 --batch_size=1
+```
+
+Python + Paddle result:
+
+```shell
+Train cost, Step 0, Cost 564.317017
+Train cost, Step 1, Cost 463.900238
+Train cost, Step 2, Cost 1197.537354
+Train cost, Step 3, Cost 1093.833008
+Train cost, Step 4, Cost 1282.760254
+Train cost, Step 5, Cost 792.097351
+Train cost, Step 6, Cost 491.775848
+Train cost, Step 7, Cost 698.496033
+Train cost, Step 8, Cost 248.444885
+Train cost, Step 9, Cost 325.135132
+```
+
+C++ + Paddle-Lite command:
+```
+./demo_trainer true
+```
+
+C++ + Paddle-Lite result:
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+## Loss curve
+
+With the training batch size fixed at 20 and a global shuffle of the training data every epoch, the loss curves of Paddle and Lite after 100 epochs compare as follows.
+
+![lr_loss](image/lr_loss.png)
+
+To reproduce this, the Paddle + Python commands are:
+
+```
+git clone https://github.com/PaddlePaddle/book.git
+cd book/01.fit_a_line
+python train.py
+```
+
+The Lite + C++ command is:
+```
+./demo_trainer false
+```
diff --git a/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b41808352a186e8ed434c0cf9364a9cae7d3928e
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 2.8)
+set (CMAKE_CXX_STANDARD 11)
+
+# Project's name
+
+if(NOT DEFINED LITE_ROOT)
+  message(FATAL_ERROR "please set LITE_ROOT with
+          -DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/")
+endif()
+
+project(demo_trainer)
+# Set the output folder where your program will be created
+set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
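+# put built libraries beside the executable in ./bin as well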
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+
+# The following folders will be included
+include_directories("include")
+include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include")
+
+add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc)
+
+TARGET_LINK_LIBRARIES(demo_trainer
+"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so")
diff --git a/lite/demo/cxx/train_demo/cplus_train/data_reader.cc b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4546e2e5fecc17321e8126485022b4ac30876747
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/data_reader.h"
+#include <sstream>
+
+using std::string;
+using std::vector;
+
+int FEATURE_NUM = 13;
+float rate = 0.8;
+
+int get_samples(string line, vector<float>* feature, float* label) {
+  std::istringstream reader(line);
+  std::vector<float> numbers;
+  do {
+    // read as many numbers as possible.
+    for (float number; reader >> number;) {
+      numbers.push_back(number);
+    }
+    // consume and discard token from stream.
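+    // (a row of housing.data is whitespace-separated; a failed extraction
+    // means a non-numeric token, which is cleared and skipped here)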
+    if (reader.fail()) {
+      reader.clear();
+      std::string token;
+      reader >> token;
+    }
+  } while (!reader.eof());
+
+  assert(numbers.size() == FEATURE_NUM + 1);
+  for (int i = 0; i < FEATURE_NUM; i++) {
+    feature->push_back(numbers[i]);
+  }
+  *label = numbers[FEATURE_NUM];
+  return 0;
+}
+
+int normalize(const vector<vector<float>>& origin_features,
+              vector<vector<float>>* features,
+              float rate) {
+  int inf = std::numeric_limits<int>::max();
+  vector<float> min_vec(FEATURE_NUM, static_cast<float>(inf));
+  vector<float> max_vec(FEATURE_NUM, -(static_cast<float>(inf)));
+  vector<float> sum_vec(FEATURE_NUM, 0);
+  vector<float> avg_vec(FEATURE_NUM, 0);
+
+  for (int i = 0; i < origin_features.size(); i++) {
+    for (int j = 0; j < FEATURE_NUM; j++) {
+      min_vec[j] = min(min_vec[j], origin_features[i][j]);
+      max_vec[j] = max(max_vec[j], origin_features[i][j]);
+      sum_vec[j] += origin_features[i][j];
+    }
+  }
+
+  for (int i = 0; i < FEATURE_NUM; i++) {
+    avg_vec[i] = sum_vec[i] / origin_features.size();
+  }
+
+  for (int i = 0; i < origin_features.size() * rate - 1; i++) {
+    vector<float> feat;
+    for (int j = 0; j < FEATURE_NUM; j++) {
+      feat.push_back((origin_features[i][j] - avg_vec[j]) /
+                     (max_vec[j] - min_vec[j]));
+    }
+    features->push_back(feat);
+  }
+  return 0;
+}
+
+int read_samples(const string fname,
+                 vector<vector<float>>* features,
+                 vector<float>* labels) {
+  fstream fin;
+  fin.open(fname);
+  if (!static_cast<bool>(fin)) {
+    return 1;
+  }
+  vector<vector<float>> origin_features;
+  vector<string> lines;
+  string line;
+  while (getline(fin, line)) {
+    lines.push_back(line);
+  }
+  fin.close();
+
+  for (int i = 0; i < lines.size(); i++) {
+    vector<float> feat;
+    float lbl = 0;
+    get_samples(lines[i], &feat, &lbl);
+    origin_features.push_back(feat);
+    if (i < lines.size() * rate - 1) {
+      labels->push_back(lbl);
+    }
+  }
+
+  cout << "finished reading data" << endl;
+  normalize(origin_features, features, rate);
+  assert(features->size() == labels->size());
+  return 0;
+}
diff --git a/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f035078fff35c4b2c0b41d0de84d2621c550d14e
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "include/data_reader.h"
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+class LRModel {
+ public:
+  void InitModel() {
+    // 1. Set CxxConfig
+    CxxConfig config;
+    config.set_model_dir("model_dir");
+    std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+    config.set_valid_places(valid_places);
+    predictor_ = CreatePaddlePredictor<CxxConfig>(config);
+  }
+
+  float Predict(const vector<vector<float>>& features,
+                const vector<float>& labels) {
+    // Create Tensor
+    assert(features.size() == labels.size());
+    int batch_size = features.size();
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor_->GetInput(0)));
+    input_tensor->Resize(shape_t({batch_size, FEATURE_NUM}));
+    auto* data = input_tensor->mutable_data<float>();
+    for (int i = 0; i < batch_size; i++) {
+      for (int j = 0; j < FEATURE_NUM; j++) {
+        data[FEATURE_NUM * i + j] = features[i][j];
+      }
+    }
+    std::unique_ptr<Tensor> y_tensor(std::move(predictor_->GetInput(1)));
+    y_tensor->Resize(shape_t({batch_size, 1}));
+    auto* y_data = y_tensor->mutable_data<float>();
+    for (int i = 0; i < batch_size; i++) {
+      y_data[i] = labels[i];
+    }
+    predictor_->Run();
+    std::unique_ptr<const Tensor> output_tensor(
+        std::move(predictor_->GetOutput(0)));
+    return output_tensor->data<float>()[0];
+  }
+
+ private:
+  std::shared_ptr<PaddlePredictor> predictor_;
+};
+
+int shuffle(vector<vector<float>>* features, vector<float>* labels) {
+  assert(features->size() == labels->size());
+  vector<int> index;
+  for (int i = 0; i < features->size(); i++) {
+    index.push_back(i);
+  }
+  random_shuffle(index.begin(), index.end());
+
+  vector<vector<float>> tmp_features;
+  vector<float> tmp_labels;
+
+  for (int i = 0; i < features->size(); i++) {
+    tmp_features.push_back((*features)[index[i]]);
+    tmp_labels.push_back((*labels)[index[i]]);
+  }
+
+  for (int i = 0; i < features->size(); i++) {
+    for (int j = 0; j < FEATURE_NUM; j++) {
+      (*features)[i][j] = tmp_features[i][j];
+    }
+    (*labels)[i] = tmp_labels[i];
+  }
+  return 0;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc < 2) {
+    cerr << "usage: ./demo_trainer is_small" << endl;
+    cerr << "       if is_small is true, the batch size is set to 1, " << endl;
+    cerr << "       and it will only run for 10 steps." << endl;
+    return 1;
+  }
+  string is_small = argv[1];
+  vector<vector<float>> features;
+  vector<float> labels;
+  read_samples("housing.data", &features, &labels);
+  cout << "sample count: " << features.size() << endl;
+
+  std::shared_ptr<LRModel> local_model(new LRModel());
+  local_model->InitModel();
+
+  if (is_small == "true") {
+    cout << "small mode" << endl;
+    for (int i = 0; i < 10; i++) {
+      vector<vector<float>> batch_feature;
+      vector<float> batch_label;
+      batch_feature.push_back(features[i]);
+      batch_label.push_back(labels[i]);
+      auto loss = local_model->Predict(batch_feature, batch_label);
+      cout << "sample " << i << ": " << loss << endl;
+    }
+  } else if (is_small == "false") {
+    // shuffle
+    cout << "full mode" << endl;
+    int epoch = 100;
+    int batch_size = 20;
+    int step = 0;
+    for (int i = 0; i < epoch; i++) {
+      shuffle(&features, &labels);
+      for (int j = 0;
+           j < ceil(static_cast<float>(features.size()) / batch_size);
+           j++) {
+        int start_idx = j * batch_size;
+        int end_idx =
+            min((j + 1) * batch_size, static_cast<int>(features.size()));
+        auto batch_feature = vector<vector<float>>(
+            features.begin() + start_idx, features.begin() + end_idx);
+        auto batch_label =
+            vector<float>(labels.begin() + start_idx, labels.begin() + end_idx);
+        auto loss = local_model->Predict(batch_feature, batch_label);
+        if (step % 10 == 0) {
+          std::cout << "batch: " << i << ", step: " << step
+                    << ", Loss: " << loss << endl;
+        }
+        step += 1;
+      }
+    }
+  } else {
+    cerr << "wrong arg for is_small: " << is_small << endl;
+  }
+}
diff --git a/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..050e929c9135ac939dac747e2e4a2490397a4c3d
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::min;
+using std::max;
+using std::fstream;
+
+extern int FEATURE_NUM;
+
+int get_samples(string line, vector<float>* feature, float* label);
+int read_samples(const string fname,
+                 vector<vector<float>>* features,
+                 vector<float>* labels);
diff --git a/lite/demo/cxx/train_demo/cplus_train/run_build.sh b/lite/demo/cxx/train_demo/cplus_train/run_build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4fb444ebd1ecda40db2d69c24016cb78bacdc0ad
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/run_build.sh
@@ -0,0 +1,21 @@
+
+rm -rf build
+mkdir build
+cd build
+
+LITE_ROOT=$1
+NDK_ROOT=$2
+
+
+cmake .. \
+  -DLITE_ROOT=${LITE_ROOT} \
+  -DNDK_ROOT=${NDK_ROOT} \
+  -DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \
+  -DANDROID_TOOLCHAIN=gcc \
+  -DANDROID_ABI="armeabi-v7a" \
+  -DANDROID_PLATFORM=android-23 \
+  -DANDROID=true \
+  -DANDROID_STL=c++_static
+make
+cd ..
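+# the binary is produced at ./bin/demo_trainer; to try it locally: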
+# ./bin/demo_trainer diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/lite/demo/cxx/train_demo/image/lr_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..626cb57ecd5d4cf50fd4d0b8aaadcc29146ca19b Binary files /dev/null and b/lite/demo/cxx/train_demo/image/lr_loss.png differ diff --git a/lite/demo/cxx/train_demo/train.py b/lite/demo/cxx/train_demo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..37825a5cc472990664f68cb38dbf7ee7859286b8 --- /dev/null +++ b/lite/demo/cxx/train_demo/train.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import argparse + +import math +import numpy + +import paddle +import paddle.fluid as fluid + + +def parse_args(): + parser = argparse.ArgumentParser("fit_a_line") + parser.add_argument( + '--save_model', + action='store_true', + help="Whether to save main program") + parser.add_argument( + '--num_steps', + type=int, + default=1000000000000, + help="train steps") + parser.add_argument( + '--num_epochs', type=int, default=100, help="number of epochs.") + parser.add_argument( + '--batch_size', type=int, default=20, help="batch size.") + parser.add_argument( + '--shuffle', + action='store_true', + help="Whether to shuffle train data.") + args = parser.parse_args() + return args + +# For training test cost +def train_test(executor, program, reader, feeder, fetch_list): + accumulated = 1 * [0] + count = 0 + for data_test in reader(): + outs = executor.run( + program=program, feed=feeder.feed(data_test), fetch_list=fetch_list) + accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)] + count += 1 + return [x_d / count for x_d in accumulated] + + +def main(): + if args.shuffle: + print("doing shuffle") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=args.batch_size) + else: + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=args.batch_size) + + # feature vector of length 13 + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + + main_program = fluid.default_main_program() + startup_program = fluid.default_startup_program() + + main_program.random_seed = 90 + startup_program.random_seed = 90 + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(cost) + + test_program = main_program.clone(for_test=True) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + num_epochs = args.num_epochs + + # main train loop. 
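+    # (fluid.DataFeeder converts each reader batch, a list of (features, label)
+    # tuples, into the tensors bound to the 'x' and 'y' slots)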
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(startup_program) + if args.save_model: + fluid.io.save_persistables(exe, "model_dir") + + # add feed and fetch op + feeded_var_names = ['x', 'y'] + fetch_var_names = ['mean_0.tmp_0'] + fluid.io.prepend_feed_ops(main_program, feeded_var_names) + fluid.io.append_fetch_ops(main_program, fetch_var_names) + with open("model_dir/__model__", "wb") as f: + f.write(main_program.desc.serialize_to_string()) + + with open("debug_main_program", "w") as f: + f.write(str(main_program)) + print("train model saved to model_dir") + return + + train_prompt = "Train cost" + step = 0 + for pass_id in range(num_epochs): + for data_train in train_reader(): + avg_loss_value, = exe.run( + main_program, + feed=feeder.feed(data_train), + fetch_list=[avg_loss]) + print("%s, Step %d, Cost %f" % + (train_prompt, step, avg_loss_value[0])) + if step == args.num_steps - 1: + return + step += 1 + + if math.isnan(float(avg_loss_value[0])): + sys.exit("got NaN loss, training failed.") + + +if __name__ == '__main__': + args = parse_args() + main() diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 7550d770145d92ebd343f96a82c6f34d72c91ea5..a3b1c3680e283a4425fe22209c443ce7cd958267 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math # 4. training kernels add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -if(LITE_WITH_TRAIN) - add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) -endif() + +add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) diff --git a/lite/kernels/bm/bridges/CMakeLists.txt b/lite/kernels/bm/bridges/CMakeLists.txt index 75375f493fe9b6b1f436ef679a7ea8bd80e5ad0a..fe5c39380ef440a98cf0544177bb781644344eb2 100644 --- a/lite/kernels/bm/bridges/CMakeLists.txt +++ b/lite/kernels/bm/bridges/CMakeLists.txt @@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D lite_cc_library(subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS ${bm_subgraph_bridge_deps}) 
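+# bridges for the two constant-producing ops added in this patch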
+lite_cc_library(subgraph_bridge_fill_constant_op_bm SRCS fill_constant_op.cc DEPS ${bm_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS ${bm_subgraph_bridge_deps}) set(bm_subgraph_bridges subgraph_bridge_registry @@ -58,4 +60,6 @@ set(bm_subgraph_bridges subgraph_bridge_reduce_full_op_bm subgraph_bridge_squeeze_op_bm subgraph_bridge_cast_op_bm + subgraph_bridge_fill_constant_op_bm + subgraph_bridge_assign_value_op_bm CACHE INTERNAL "bm_subgraph_bridges") diff --git a/lite/kernels/bm/bridges/act_op.cc b/lite/kernels/bm/bridges/act_op.cc index 091743157995ab1a00e798a6ac560454d4b22ae7..1739dd4185ebcff6a35e2f75c5f8c84ceebd2f0a 100644 --- a/lite/kernels/bm/bridges/act_op.cc +++ b/lite/kernels/bm/bridges/act_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/npu/bridges/registry.h" @@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(output_var_name)->GetMutable(); auto output_dims = output->dims(); - const int64_t* x_shape_data = const_cast(&x_dims.data()[0]); - const int64_t* output_shape_data = - const_cast(&output_dims.data()[0]); + bool x_is_const = !graph->HasNode(x_var_name); std::vector i_x_shape_data(x_dims.size()); std::vector i_output_shape_data(output_dims.size()); for (size_t i = 0; i < x_dims.size(); i++) { - i_x_shape_data[i] = static_cast(x_shape_data[i]); + i_x_shape_data[i] = x_dims[i]; } for (size_t i = 0; i < output_dims.size(); i++) { - i_output_shape_data[i] = static_cast(output_shape_data[i]); + i_output_shape_data[i] = output_dims[i]; } float alpha = 0.f; int active_type_id = 0; @@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(FATAL) << "[BM] unsupport act type"; return FAILED; } + const float* x_data = const_cast(x->mutable_data()); + if (x_is_const) { + bm_add_const_tensor(graph->GetCompilerHandle(), + static_cast(x_var_name.c_str()), + const_cast(&i_x_shape_data[0]), + x_dims.size(), + static_cast(DTYPE_FP32), + static_cast(x_data)); + } if (op_type == "relu" || op_type == "leaky_relu") { add_relu_layer(graph->GetCompilerHandle(), const_cast(&i_x_shape_data[0]), diff --git a/lite/kernels/bm/bridges/assign_value_op.cc b/lite/kernels/bm/bridges/assign_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c59cdf29def68c58cd33ee44b24816e0dbccd32e --- /dev/null +++ b/lite/kernels/bm/bridges/assign_value_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
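+//
+// assign_value bridge: exposes the op's fp32_values attribute to the BM
+// compiler as a compile-time constant tensor.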
+ +#include +#include +#include +#include "lite/kernels/bm/bridges/graph.h" +#include "lite/kernels/bm/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace bm { + +int AssignValueConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + auto output_dims = output->dims(); + std::vector i_output_shape_data(output_dims.size()); + int buffer_size = 1; + for (size_t i = 0; i < output_dims.size(); i++) { + i_output_shape_data[i] = static_cast(output_dims[i]); + buffer_size *= i_output_shape_data[i]; + } + auto fp32_values = op_info->GetAttr>("fp32_values"); + float* assign_data = + reinterpret_cast(malloc(buffer_size * sizeof(float))); + CHECK(assign_data != nullptr); + CHECK_EQ(buffer_size, fp32_values.size()); + + bm_add_const_tensor(graph->GetCompilerHandle(), + static_cast(output_var_name.c_str()), + const_cast(i_output_shape_data.data()), + output_dims.size(), + static_cast(DTYPE_FP32), + reinterpret_cast(assign_data)); + graph->AddNode(output_var_name); + return SUCCESS; +} + +} // namespace bm +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(assign_value, + kBM, + paddle::lite::subgraph::bm::AssignValueConverter); diff --git a/lite/kernels/bm/bridges/conv_op.cc b/lite/kernels/bm/bridges/conv_op.cc index e4dff107024c02dcfe25afe37723b7d2418369b5..2a0903191b82bbc1c409f59f2eb19bd2ffd5ddac 100644 --- a/lite/kernels/bm/bridges/conv_op.cc +++ b/lite/kernels/bm/bridges/conv_op.cc @@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto filter_var_name = op_info->Input("Filter").front(); auto filter = scope->FindVar(filter_var_name)->GetMutable(); auto filter_dims = filter->dims(); + CHECK_EQ(input_dims.size(), 4); CHECK_EQ(output_dims.size(), 4); CHECK_EQ(filter_dims.size(), 4); @@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { dilations[1], static_cast(has_bias)); graph->AddNode(output_var_name); + LOG(INFO) << output_var_name << input_dims << " " << output_dims; return SUCCESS; } diff --git a/lite/kernels/bm/bridges/elementwise_ops.cc b/lite/kernels/bm/bridges/elementwise_ops.cc index 3006a8b6fdaef5a250af1b2e764aff9f2913898e..de5a62dae068620c378cd566abfde556c0f63102 100644 --- a/lite/kernels/bm/bridges/elementwise_ops.cc +++ b/lite/kernels/bm/bridges/elementwise_ops.cc @@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output_dims = output->dims(); const int64_t* output_shape_data = const_cast(&output_dims.data()[0]); + LOG(INFO) << x_dims << " " << output_dims; std::vector i_output_shape_data(output_dims.size()); for (size_t i = 0; i < output_dims.size(); i++) { i_output_shape_data[i] = static_cast(output_shape_data[i]); diff --git a/lite/kernels/bm/bridges/fill_constant_op.cc b/lite/kernels/bm/bridges/fill_constant_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..835ccf0eb4e72ad945b7a24643190ff49f0b5723 --- /dev/null +++ b/lite/kernels/bm/bridges/fill_constant_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/kernels/bm/bridges/graph.h" +#include "lite/kernels/bm/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace bm { + +int FillConstantConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + auto output_dims = output->dims(); + std::vector i_output_shape_data(output_dims.size()); + int buffer_size = 1; + for (size_t i = 0; i < output_dims.size(); i++) { + i_output_shape_data[i] = static_cast(output_dims[i]); + } + float* const_data = + reinterpret_cast(malloc(buffer_size * sizeof(float))); + CHECK(const_data != nullptr); + auto value = op_info->GetAttr("value"); + for (size_t i = 0; i < buffer_size; i++) { + const_data[i] = value; + } + bm_add_const_tensor(graph->GetCompilerHandle(), + static_cast(output_var_name.c_str()), + const_cast(i_output_shape_data.data()), + output_dims.size(), + static_cast(DTYPE_FP32), + reinterpret_cast(const_data)); + graph->AddNode(output_var_name); + return SUCCESS; +} + +} // namespace bm +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fill_constant, + kBM, + paddle::lite::subgraph::bm::FillConstantConverter); diff --git a/lite/kernels/bm/bridges/mul_op.cc b/lite/kernels/bm/bridges/mul_op.cc index 06ec177bceb883758c42d45c9b07006a83b3c9f6..35e1aac7660683ff5544a7d72574167359b29fdb 100644 --- a/lite/kernels/bm/bridges/mul_op.cc +++ b/lite/kernels/bm/bridges/mul_op.cc @@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto op_info = op->op_info(); auto op_type = op_info->Type(); auto unique_op_name = lite::subgraph::bm::UniqueName(op_type); - // only support y is const // input auto x_var_name = op_info->Input("X").front(); auto x = scope->FindVar(x_var_name)->GetMutable(); @@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto y_var_name = op_info->Input("Y").front(); auto y = scope->FindVar(y_var_name)->GetMutable(); auto y_dims = y->dims(); + bool y_is_const = !graph->HasNode(y_var_name); + CHECK_EQ(y_dims.size(), 2); + int i_y_shape_data[2]; + for (size_t i = 0; i < 2; i++) { + i_y_shape_data[i] = y_dims[i]; + } // output auto output_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(output_var_name)->GetMutable(); @@ -71,20 +76,39 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (size_t i = 0; i < output_dims.size(); i++) { i_output_shape_data[i] = static_cast(output_shape_data[i]); } - add_fc_layer(graph->GetCompilerHandle(), - const_cast(&i_x_reshape_shape_data[0]), - 2, - static_cast(unique_op_reshape_name.c_str()), - const_cast(&i_output_shape_data[0]), - 
output_dims.size(), - static_cast(output_var_name.c_str()), - static_cast(unique_op_name.c_str()), - i_x_reshape_shape_data[1], - i_output_shape_data[1], - static_cast(y->mutable_data()), - nullptr, - 0, - 0); + if (y_is_const) { + add_fc_layer(graph->GetCompilerHandle(), + const_cast(&i_x_reshape_shape_data[0]), + 2, + static_cast(unique_op_reshape_name.c_str()), + const_cast(&i_output_shape_data[0]), + output_dims.size(), + static_cast(output_var_name.c_str()), + static_cast(unique_op_name.c_str()), + i_x_reshape_shape_data[1], + i_output_shape_data[1], + static_cast(y->mutable_data()), + nullptr, + 0, + 0); + } else { + add_fc_weight_layer( + graph->GetCompilerHandle(), + const_cast(&i_x_reshape_shape_data[0]), + 2, + static_cast(unique_op_reshape_name.c_str()), + const_cast(&i_output_shape_data[0]), + output_dims.size(), + static_cast(output_var_name.c_str()), + static_cast(unique_op_name.c_str()), + const_cast(&i_y_shape_data[0]), + 2, + static_cast(y_var_name.c_str()), + i_x_reshape_shape_data[1], + nullptr, + 0, + 0); + } graph->AddNode(output_var_name); return SUCCESS; } diff --git a/lite/kernels/bm/bridges/paddle_use_bridges.h b/lite/kernels/bm/bridges/paddle_use_bridges.h index 8dbbb53d810952743228d96d60d7927965d2d527..48b708fed4889a9a0f515d2b9d76232cfb532be5 100644 --- a/lite/kernels/bm/bridges/paddle_use_bridges.h +++ b/lite/kernels/bm/bridges/paddle_use_bridges.h @@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM); USE_SUBGRAPH_BRIDGE(squeeze, kBM); USE_SUBGRAPH_BRIDGE(squeeze2, kBM); USE_SUBGRAPH_BRIDGE(cast, kBM); +USE_SUBGRAPH_BRIDGE(fill_constant, kBM); +USE_SUBGRAPH_BRIDGE(assign_value, kBM); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index 338939f019fb8da37d0b0a234e2c8b408e5a9ad0..45f9bd16c433ef94daf242eae9c2168ac1424147 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() { graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); for (auto& inst : origin_program_) { - auto op = inst.op(); + auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 3fb3136bfc0787f9d8e539039811d25559919f4e..0fb3c2ea7aa66b313411ac9d97c9918eb2ca8d2f 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -8,6 +8,8 @@ add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_de add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) @@ -45,6 +47,8 @@ lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_ #nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) nv_test(nearest_interp_compute_cuda_test SRCS nearest_interp_compute_test.cc 
DEPS nearest_interp_compute_cuda) nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda) +nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda) +nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda) nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) @@ -61,7 +65,7 @@ nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) -#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) +#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) #nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) if(LITE_BUILD_EXTRA) diff --git a/lite/kernels/cuda/abs_compute.cu b/lite/kernels/cuda/abs_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f00aacc0cdcea07329e3836c7068f419d26f90c --- /dev/null +++ b/lite/kernels/cuda/abs_compute.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
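+//
+// element-wise |x| on CUDA: one thread per element; the float specialization
+// calls fabsf and the double specialization calls fabs, so each stays in its
+// own precision.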
+ +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/abs_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void AbsKernel(const int num, const T* input, T* output); + +template <> +__global__ void AbsKernel(const int num, + const float* input, + float* output) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + output[index] = fabsf(input[index]); + } +} + +template <> +__global__ void AbsKernel(const int num, + const double* input, + double* output) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + output[index] = fabs(input[index]); + } +} + +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + int num = static_cast(param.X->numel()); + auto input = param.X->data(); + auto output = param.Out->mutable_data(TARGET(kCUDA)); + + const int threads = 512; + const int blocks = (num + threads - 1) / threads; + AbsKernel<<>>(num, input, output); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + abs, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/abs_compute.h b/lite/kernels/cuda/abs_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d1f8a0cc5ac52e01cc8ea920bdad62ef46fd0640 --- /dev/null +++ b/lite/kernels/cuda/abs_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + virtual ~AbsCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/abs_compute_test.cc b/lite/kernels/cuda/abs_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfbcae56fa51fc59c9917aa112fa5320c2759a9a --- /dev/null +++ b/lite/kernels/cuda/abs_compute_test.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/abs_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(abs, normal) { + AbsCompute abs_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ActivationParam param; + + Tensor x, y, x_cpu, y_cpu; + int h = 3, w = 3; + y.Resize({h, w}); + x_cpu.Resize({h, w}); + y_cpu.Resize({h, w}); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + float* x_cpu_data = x_cpu.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); i++) { + x_cpu_data[i] = i - 1.5; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + abs_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + abs_kernel.SetContext(std::move(ctx)); + abs_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], std::fabs(x_cpu_data[i]), 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_compute.cu b/lite/kernels/cuda/elementwise_compute.cu index 64759f86f5df85f9855b9c1f186bbc9c039a044c..02b7c8f7d9e829b100e6c96aca2a8cee3ca74ef1 100644 --- a/lite/kernels/cuda/elementwise_compute.cu +++ b/lite/kernels/cuda/elementwise_compute.cu @@ -152,6 +152,18 @@ void ElementwiseAddComputeNHWC::Run() { if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } +void ElementwiseSubCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kSUB, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseSubComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kSUB, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + void ElementwiseMulCompute::Run() { ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) cudaError_t error = cudaGetLastError(); @@ -204,6 +216,17 @@ REGISTER_LITE_KERNEL(elementwise_add, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); +REGISTER_LITE_KERNEL(elementwise_sub, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseSubCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_add, kCUDA, kFloat, @@ -224,6 +247,26 @@ REGISTER_LITE_KERNEL(elementwise_add, DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL(elementwise_sub, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseSubComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + 
{LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_mul, kCUDA, kFloat, diff --git a/lite/kernels/cuda/elementwise_compute.h b/lite/kernels/cuda/elementwise_compute.h index 986a4db2272d9a6607090babd937747f861f49c7..bc9ffd5d27c7b030f397d1b631a155cae5f34678 100644 --- a/lite/kernels/cuda/elementwise_compute.h +++ b/lite/kernels/cuda/elementwise_compute.h @@ -38,6 +38,24 @@ class ElementwiseAddComputeNHWC virtual ~ElementwiseAddComputeNHWC() = default; }; +class ElementwiseSubCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseSubCompute() = default; +}; + +class ElementwiseSubComputeNHWC + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseSubComputeNHWC() = default; +}; + class ElementwiseMulCompute : public KernelLite { public: diff --git a/lite/kernels/cuda/tanh_compute.cu b/lite/kernels/cuda/tanh_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f9e2729a7fa0f300308b9f1afcf35e852d11223 --- /dev/null +++ b/lite/kernels/cuda/tanh_compute.cu @@ -0,0 +1,56 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/tanh_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void TanhKernel(const int num, const T* input, T* output) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + output[index] = tanh(input[index]); + } +} + +void TanhCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + int num = static_cast(param.X->numel()); + auto input = param.X->data(); + auto output = param.Out->mutable_data(TARGET(kCUDA)); + + const int threads = 512; + const int blocks = (num + threads - 1) / threads; + TanhKernel<<>>(num, input, output); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + tanh, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::TanhCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/tanh_compute.h b/lite/kernels/cuda/tanh_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b23b27882cc2ef6f5c8e15ba49fbdd5316cbfa3e --- /dev/null +++ b/lite/kernels/cuda/tanh_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class TanhCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + virtual ~TanhCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/tanh_compute_test.cc b/lite/kernels/cuda/tanh_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bc8f25df0bed46254c56d8ec1080e45062bada2 --- /dev/null +++ b/lite/kernels/cuda/tanh_compute_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
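+//
+// sanity-checks the CUDA tanh kernel against std::tanh on a small 3x3 tensor.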
+ +#include "lite/kernels/cuda/tanh_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(tanh, fp32) { + TanhCompute tanh_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ActivationParam param; + + Tensor x, y, x_cpu, y_cpu; + int h = 3, w = 3; + y.Resize({h, w}); + x_cpu.Resize({h, w}); + y_cpu.Resize({h, w}); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + float* x_cpu_data = x_cpu.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); i++) { + x_cpu_data[i] = i - 1.5; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + tanh_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + tanh_kernel.SetContext(std::move(ctx)); + tanh_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], tanh(x_cpu_data[i]), 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index f337e518abd071ac262ce9ee47beae1600cc57d1..a52428aa097099150139de82627d5770c9b9071c 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) - -#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) -#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) diff --git a/lite/kernels/host/multiclass_nms_compute.cc b/lite/kernels/host/multiclass_nms_compute.cc index 9f4c2fb6f53c17f97cb9a1aaeecd493713899cab..5a09fca72b4bb30ac67b1186cf90c58a5f9a1dd4 100644 --- a/lite/kernels/host/multiclass_nms_compute.cc +++ b/lite/kernels/host/multiclass_nms_compute.cc @@ -92,6 +92,7 @@ T PolyIoU(const T* box1, const size_t box_size, const bool normalized) { LOG(FATAL) << "PolyIoU not implement."; + return *box1; } template @@ -369,6 +370,7 @@ void MulticlassNmsCompute::Run() { } } else { outs->Resize({static_cast(num_kept), out_dim}); + outs->mutable_data(); int offset = 0; int* oindices = nullptr; for (int i = 0; i < n; ++i) { diff --git a/lite/kernels/host/multiclass_nms_compute_test.cc b/lite/kernels/host/multiclass_nms_compute_test.cc deleted file mode 100644 index 83fb717042515a7a06fe0c014fca7482ad6c8684..0000000000000000000000000000000000000000 --- a/lite/kernels/host/multiclass_nms_compute_test.cc +++ /dev/null @@ -1,368 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/host/multiclass_nms_compute.h" -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace host { - -template -static bool sort_score_pair_descend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -void get_max_score_index(const dtype* scores, - int num, - float threshold, - int top_k, - std::vector>* score_index_vec) { - //! Generate index score pairs. - for (int i = 0; i < num; ++i) { - if (scores[i] > threshold) { - score_index_vec->push_back(std::make_pair(scores[i], i)); - } - } - - //! Sort the score pair according to the scores in descending order - std::stable_sort(score_index_vec->begin(), - score_index_vec->end(), - sort_score_pair_descend); - - //! Keep top_k scores if needed. - if (top_k > -1 && top_k < score_index_vec->size()) { - score_index_vec->resize(top_k); - } -} - -template -dtype bbox_size(const dtype* bbox, bool normalized = true) { - if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { - // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. - return dtype(0.); - } else { - const dtype width = bbox[2] - bbox[0]; - const dtype height = bbox[3] - bbox[1]; - - if (normalized) { - return width * height; - } else { - // If bbox is not within range [0, 1]. - return (width + 1) * (height + 1); - } - } -} - -template -dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2) { - if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] || - bbox2[3] < bbox1[1]) { - return dtype(0.); - } else { - const dtype inter_xmin = std::max(bbox1[0], bbox2[0]); - const dtype inter_ymin = std::max(bbox1[1], bbox2[1]); - const dtype inter_xmax = std::min(bbox1[2], bbox2[2]); - const dtype inter_ymax = std::min(bbox1[3], bbox2[3]); - - const dtype inter_width = inter_xmax - inter_xmin; - const dtype inter_height = inter_ymax - inter_ymin; - const dtype inter_size = inter_width * inter_height; - - const dtype bbox1_size = bbox_size(bbox1); - const dtype bbox2_size = bbox_size(bbox2); - - return inter_size / (bbox1_size + bbox2_size - inter_size); - } -} - -template -void apply_nms_fast(const dtype* bboxes, - const dtype* scores, - int num, - float score_threshold, - float nms_threshold, - float eta, - int top_k, - std::vector* indices) { - // Get top_k scores (with corresponding indices). - std::vector> score_index_vec; - get_max_score_index(scores, num, score_threshold, top_k, &score_index_vec); - - // Do nms. 
- float adaptive_threshold = nms_threshold; - indices->clear(); - - while (score_index_vec.size() != 0) { - const int idx = score_index_vec.front().second; - bool keep = true; - - for (int k = 0; k < indices->size(); ++k) { - if (keep) { - const int kept_idx = (*indices)[k]; - float overlap = - jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - - if (keep) { - indices->push_back(idx); - } - - score_index_vec.erase(score_index_vec.begin()); - - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } -} - -template -void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param, - int class_num, - const std::vector& priors, - bool share_location, - std::vector* result) { - int background_id = param.background_label; - int keep_topk = param.keep_top_k; - int nms_topk = param.nms_top_k; - float conf_thresh = param.score_threshold; - float nms_thresh = param.nms_threshold; - float nms_eta = param.nms_eta; - const dtype* bbox_data = param.bboxes->data(); - const dtype* conf_data = param.scores->data(); - dtype* out = param.out->mutable_data(); - (*result).clear(); - - int num_kept = 0; - std::vector>> all_indices; - int64_t conf_offset = 0; - int64_t bbox_offset = 0; - for (int i = 0; i < priors.size(); ++i) { - std::map> indices; - int num_det = 0; - int num_priors = priors[i]; - - int conf_idx = class_num * conf_offset; - int bbox_idx = - share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num; - - for (int c = 0; c < class_num; ++c) { - if (c == background_id) { - // Ignore background class - continue; - } - - const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors; - const dtype* cur_bbox_data = bbox_data + bbox_idx; - - if (!share_location) { - cur_bbox_data += c * num_priors * 4; - } - - apply_nms_fast(cur_bbox_data, - cur_conf_data, - num_priors, - conf_thresh, - nms_thresh, - nms_eta, - nms_topk, - &(indices[c])); - num_det += indices[c].size(); - } - - if (keep_topk > -1 && num_det > keep_topk) { - std::vector>> score_index_pairs; - - for (auto it = indices.begin(); it != indices.end(); ++it) { - int label = it->first; - const std::vector& label_indices = it->second; - - for (int j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - float score = conf_data[conf_idx + label * num_priors + idx]; - score_index_pairs.push_back( - std::make_pair(score, std::make_pair(label, idx))); - } - } - - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), - score_index_pairs.end(), - sort_score_pair_descend>); - score_index_pairs.resize(keep_topk); - // Store the new indices. - std::map> new_indices; - - for (int j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); - } - - all_indices.push_back(new_indices); - num_kept += keep_topk; - } else { - all_indices.push_back(indices); - num_kept += num_det; - } - conf_offset += num_priors; - bbox_offset += num_priors; - } - - if (num_kept == 0) { - (*result).clear(); - (*result).resize(1); - (*result)[0] = -1; - return; - } else { - (*result).resize(num_kept * 6); - } - - int count = 0; - - conf_offset = 0; - bbox_offset = 0; - for (int i = 0; i < priors.size(); ++i) { - int num_priors = priors[i]; - int conf_idx = class_num * conf_offset; - int bbox_idx = - share_location ? 
-template <typename dtype>
-void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
-                                int class_num,
-                                const std::vector<int>& priors,
-                                bool share_location,
-                                std::vector<dtype>* result) {
-  int background_id = param.background_label;
-  int keep_topk = param.keep_top_k;
-  int nms_topk = param.nms_top_k;
-  float conf_thresh = param.score_threshold;
-  float nms_thresh = param.nms_threshold;
-  float nms_eta = param.nms_eta;
-  const dtype* bbox_data = param.bboxes->data<dtype>();
-  const dtype* conf_data = param.scores->data<dtype>();
-  dtype* out = param.out->mutable_data<dtype>();
-  (*result).clear();
-
-  int num_kept = 0;
-  std::vector<std::map<int, std::vector<int>>> all_indices;
-  int64_t conf_offset = 0;
-  int64_t bbox_offset = 0;
-  for (int i = 0; i < priors.size(); ++i) {
-    std::map<int, std::vector<int>> indices;
-    int num_det = 0;
-    int num_priors = priors[i];
-
-    int conf_idx = class_num * conf_offset;
-    int bbox_idx =
-        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
-
-    for (int c = 0; c < class_num; ++c) {
-      if (c == background_id) {
-        // Ignore background class
-        continue;
-      }
-
-      const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors;
-      const dtype* cur_bbox_data = bbox_data + bbox_idx;
-
-      if (!share_location) {
-        cur_bbox_data += c * num_priors * 4;
-      }
-
-      apply_nms_fast(cur_bbox_data,
-                     cur_conf_data,
-                     num_priors,
-                     conf_thresh,
-                     nms_thresh,
-                     nms_eta,
-                     nms_topk,
-                     &(indices[c]));
-      num_det += indices[c].size();
-    }
-
-    if (keep_topk > -1 && num_det > keep_topk) {
-      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-
-      for (auto it = indices.begin(); it != indices.end(); ++it) {
-        int label = it->first;
-        const std::vector<int>& label_indices = it->second;
-
-        for (int j = 0; j < label_indices.size(); ++j) {
-          int idx = label_indices[j];
-          float score = conf_data[conf_idx + label * num_priors + idx];
-          score_index_pairs.push_back(
-              std::make_pair(score, std::make_pair(label, idx)));
-        }
-      }
-
-      // Keep top k results per image.
-      std::stable_sort(score_index_pairs.begin(),
-                       score_index_pairs.end(),
-                       sort_score_pair_descend<std::pair<int, int>>);
-      score_index_pairs.resize(keep_topk);
-      // Store the new indices.
-      std::map<int, std::vector<int>> new_indices;
-
-      for (int j = 0; j < score_index_pairs.size(); ++j) {
-        int label = score_index_pairs[j].second.first;
-        int idx = score_index_pairs[j].second.second;
-        new_indices[label].push_back(idx);
-      }
-
-      all_indices.push_back(new_indices);
-      num_kept += keep_topk;
-    } else {
-      all_indices.push_back(indices);
-      num_kept += num_det;
-    }
-    conf_offset += num_priors;
-    bbox_offset += num_priors;
-  }
-
-  if (num_kept == 0) {
-    (*result).clear();
-    (*result).resize(1);
-    (*result)[0] = -1;
-    return;
-  } else {
-    (*result).resize(num_kept * 6);
-  }
-
-  int count = 0;
-
-  conf_offset = 0;
-  bbox_offset = 0;
-  for (int i = 0; i < priors.size(); ++i) {
-    int num_priors = priors[i];
-    int conf_idx = class_num * conf_offset;
-    int bbox_idx =
-        share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
-
-    for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) {
-      int label = it->first;
-      std::vector<int>& indices = it->second;
-      const dtype* cur_conf_data = conf_data + conf_idx + label * num_priors;
-      const dtype* cur_bbox_data = bbox_data + bbox_idx;
-
-      if (!share_location) {
-        cur_bbox_data += label * num_priors * 4;
-      }
-
-      for (int j = 0; j < indices.size(); ++j) {
-        int idx = indices[j];
-        (*result)[count * 6] = label;
-        (*result)[count * 6 + 1] = cur_conf_data[idx];
-
-        for (int k = 0; k < 4; ++k) {
-          (*result)[count * 6 + 2 + k] = cur_bbox_data[idx * 4 + k];
-        }
-
-        ++count;
-      }
-    }
-    conf_offset += num_priors;
-    bbox_offset += num_priors;
-  }
-}
-
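When `keep_top_k` caps the detection count, the reference above flattens every per-class survivor into a `(score, (label, index))` pair and stable-sorts all pairs by descending score before truncating. A standalone illustration of that pattern (made-up scores, not the patch's data):

```cpp
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // (score, (class label, box index)) pairs gathered across all classes.
  std::vector<std::pair<float, std::pair<int, int>>> pairs = {
      {0.9f, {1, 0}}, {0.4f, {2, 3}}, {0.8f, {1, 2}}, {0.6f, {3, 1}}};

  const size_t keep_top_k = 2;
  // stable_sort mirrors sort_score_pair_descend: descending by score, with
  // ties left in their original order so results stay deterministic.
  std::stable_sort(pairs.begin(), pairs.end(),
                   [](const std::pair<float, std::pair<int, int>>& a,
                      const std::pair<float, std::pair<int, int>>& b) {
                     return a.first > b.first;
                   });
  if (pairs.size() > keep_top_k) pairs.resize(keep_top_k);

  for (const auto& p : pairs)
    std::cout << p.first << " -> label " << p.second.first << ", idx "
              << p.second.second << "\n";  // keeps 0.9 and 0.8 entries
}
```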
-TEST(multiclass_nms_host, init) {
-  MulticlassNmsCompute multiclass_nms;
-  ASSERT_EQ(multiclass_nms.precision(), PRECISION(kFloat));
-  ASSERT_EQ(multiclass_nms.target(), TARGET(kHost));
-}
-
-TEST(multiclass_nms_host, retrive_op) {
-  auto multiclass_nms =
-      KernelRegistry::Global().Create<TARGET(kHost), PRECISION(kFloat)>(
-          "multiclass_nms");
-  ASSERT_FALSE(multiclass_nms.empty());
-  ASSERT_TRUE(multiclass_nms.front());
-}
-
-TEST(multiclass_nms_host, compute) {
-  MulticlassNmsCompute multiclass_nms;
-  operators::MulticlassNmsParam param;
-  lite::Tensor bbox, conf, out;
-  std::vector<float> out_ref;
-
-  for (std::vector<int> priors : {std::vector<int>({2, 2, 2})}) {
-    int N = priors.size();
-    for (bool share_location : {true}) {
-      for (int class_num : {1, 4, 10}) {
-        DDim* bbox_dim;
-        DDim* conf_dim;
-        int M = priors[0];
-        if (share_location) {
-          bbox_dim = new DDim({N, M, 4});
-        } else {
-          bbox_dim = new DDim({class_num, M, 4});
-        }
-        conf_dim = new DDim({N, class_num, M});
-        bbox.Resize(*bbox_dim);
-        conf.Resize(*conf_dim);
-        for (int background_id : {0}) {
-          for (int keep_topk : {1, 5, 10}) {
-            for (int nms_topk : {1, 5, 10}) {
-              for (float nms_eta : {1.0, 0.99, 0.9}) {
-                for (float nms_thresh : {0.5, 0.7}) {
-                  for (float conf_thresh : {0.5, 0.7}) {
-                    auto* conf_data = conf.mutable_data<float>();
-                    auto* bbox_data = bbox.mutable_data<float>();
-                    for (int i = 0; i < bbox_dim->production(); ++i) {
-                      bbox_data[i] = i * 1. / bbox_dim->production();
-                    }
-                    for (int i = 0; i < conf_dim->production(); ++i) {
-                      conf_data[i] = i * 1. / conf_dim->production();
-                    }
-                    param.bboxes = &bbox;
-                    param.scores = &conf;
-                    param.out = &out;
-                    param.background_label = background_id;
-                    param.keep_top_k = keep_topk;
-                    param.nms_top_k = nms_topk;
-                    param.score_threshold = conf_thresh;
-                    param.nms_threshold = nms_thresh;
-                    param.nms_eta = nms_eta;
-                    multiclass_nms.SetParam(param);
-                    multiclass_nms.Run();
-                    auto* out_data = out.mutable_data<float>();
-                    out_ref.clear();
-                    multiclass_nms_compute_ref<float>(
-                        param, class_num, priors, share_location, &out_ref);
-                    EXPECT_EQ(out.dims().production(), out_ref.size());
-                    if (out.dims().production() == out_ref.size()) {
-                      auto* out_ref_data = out_ref.data();
-                      for (int i = 0; i < out.dims().production(); i++) {
-                        EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-        delete bbox_dim;
-        delete conf_dim;
-      }
-    }
-  }
-}
-
-}  // namespace host
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
diff --git a/lite/kernels/opencl/activation_buffer_compute.cc b/lite/kernels/opencl/activation_buffer_compute.cc
index c662aa89fb257aded70119ea14494111398f0529..03ccdac99e5f11e1c056374463f7a8068dbd4f56 100644
--- a/lite/kernels/opencl/activation_buffer_compute.cc
+++ b/lite/kernels/opencl/activation_buffer_compute.cc
@@ -32,8 +32,10 @@ class ReluCompute
   std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "buffer/relu_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
   }
 
   void Run() override {
@@ -46,7 +48,7 @@ class ReluCompute
     auto* x_buf = param.X->data<float, cl::Buffer>();
     auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
     STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
     VLOG(4) << TargetToStr(param.X->target());
     VLOG(4) << TargetToStr(param.Out->target());
@@ -74,6 +76,7 @@ class ReluCompute
  private:
   std::string kernel_func_name_{"relu"};
   std::string build_options_{"-DCL_DTYPE_float -DRELU"};
+  std::string time_stamp_{GetTimeStamp()};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
@@ -87,8 +90,10 @@ class SigmoidCompute
   }
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "buffer/sigmoid_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
   }
 
   void Run() override {
@@ -101,7 +106,7 @@ class SigmoidCompute
     auto* x_buf = param.X->data<float, cl::Buffer>();
     auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
     STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
     VLOG(4) << TargetToStr(param.X->target());
     VLOG(4) << TargetToStr(param.Out->target());
@@ -129,6 +134,7 @@ class SigmoidCompute
  private:
   std::string kernel_func_name_{"sigmoid"};
   std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
+  std::string time_stamp_{GetTimeStamp()};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
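From here on, every OpenCL kernel class gains a `time_stamp_` member and appends it to the kernel cache key, so two kernel instances that share a function name and build options no longer collide in the context's kernel map. A schematic sketch of the keying idea, with `std::map` standing in for the real `CLContext` cache (an assumption for illustration; the concatenated-string key is what the `kernel_key` streams above suggest):

```cpp
#include <iostream>
#include <map>
#include <string>

struct FakeKernel { std::string source_file; };  // stand-in for cl::Kernel

int main() {
  std::map<std::string, FakeKernel> kernel_cache;

  // Two instances with identical function name and build options...
  const std::string func = "relu";
  const std::string opts = "-DCL_DTYPE_float -DRELU";
  const std::string stamp_a = "101527";  // e.g. tv_usec at construction time
  const std::string stamp_b = "734092";

  // ...now produce distinct keys, so each instance keeps its own kernel.
  kernel_cache[func + opts + stamp_a] = {"buffer/relu_kernel.cl"};
  kernel_cache[func + opts + stamp_b] = {"buffer/relu_kernel.cl"};

  std::cout << kernel_cache.size() << "\n";  // 2: no collision
}
```

One caveat worth noting: `GetTimeStamp()` as added in image_helper.h returns only `tv_usec`, so keys repeat every second, and two kernels constructed in the same microsecond would still collide.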
diff --git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc
index dbe487ba91d00c2de4c08edf140526d727bac6b5..a99e588eccd79eb35a5e7c0f3da73471849ab581 100644
--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
   }
 
   void PrepareForRun() override {
-    auto& context = ctx_->As<OpenCLContext>();
     act_param_ = param_.get_mutable<param_t>();
     int act_type = static_cast<int>(act_param_->active_type);
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
             << ActivationTypeToStr(act_param_->active_type);
+#endif
     switch (act_type) {
       case 1:
         kernel_func_name_ = "relu";
@@ -71,41 +72,70 @@ class ActivationComputeImageDefault
         LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
         return;
     }
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    context.cl_context()->AddKernel(
-        kernel_func_name_, "image/activation_kernel.cl", build_options_);
-  }
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    const auto& x_dims = param.X->dims();
-    auto* x_img = param.X->data<half_t, cl::Image2D>();
-    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
-        image_shape["width"], image_shape["height"]);
-    const auto& y_dims = param.Out->dims();  // useless: check dim only
+#endif
     auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/activation_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
+
     STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
 
-    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+  void ReInitWhenNeeded() override {
+    act_param_ = param_.get_mutable<param_t>();
+    auto x_dims = act_param_->X->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      x_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->X->dims());  // w, h
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->Out->dims());  // w, h
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
+                    static_cast<cl::size_type>(x_img_shape_[1])};
+  }
+
+  void Run() override {
+    auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
+    auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, threshold_);
+    status = kernel.setArg(2, threshold_);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale_);
+    status = kernel.setArg(3, scale_);
     CL_CHECK_FATAL(status);
 
 #ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << TargetToStr(param.X->target());
-    VLOG(4) << TargetToStr(param.Out->target());
-    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
-            << image_shape["height"];
+    const auto& x_dims = act_param_->X->dims();
+    const auto& y_dims = act_param_->Out->dims();  // useless: check dim only
+    VLOG(4) << TargetToStr(act_param_->X->target());
+    VLOG(4) <<
TargetToStr(act_param_->Out->target()); + VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " " + << x_img_shape_[1]; VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " @@ -115,13 +145,12 @@ class ActivationComputeImageDefault VLOG(4) << "kernel func name:" << kernel_func_name_; #endif - auto global_work_size = - cl::NDRange{static_cast(image_shape["width"]), - static_cast(image_shape["height"])}; + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -131,10 +160,20 @@ class ActivationComputeImageDefault private: param_t* act_param_{nullptr}; + DDim x_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim last_x_dims_; std::string kernel_func_name_{}; float threshold_{6.f}; float scale_{1.f}; + cl::Kernel kernel_; + bool first_epoch_for_reinit_{true}; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; } // namespace opencl diff --git a/lite/kernels/opencl/bilinear_interp_image_compute.cc b/lite/kernels/opencl/bilinear_interp_image_compute.cc index 7e32010c0b5ff5cedad8b0da7ce7233fbf73da6f..53f260789e12a94dc39f785df12a8e988d08bcbe 100644 --- a/lite/kernels/opencl/bilinear_interp_image_compute.cc +++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc @@ -43,8 +43,10 @@ class BilinearInterpImageCompute bilinear_interp_param_ = param_.get_mutable(); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/bilinear_interp_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/bilinear_interp_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -103,7 +105,7 @@ class BilinearInterpImageCompute #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; @@ -159,6 +161,7 @@ class BilinearInterpImageCompute param_t* bilinear_interp_param_{nullptr}; std::string kernel_func_name_{"bilinear_interp"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/concat_buffer_compute.cc b/lite/kernels/opencl/concat_buffer_compute.cc index 010e7726170ab1f40adc2fcb56a66835ac7d2bd2..414f62ff0c4f86f29756b933817de2a7682ecd4c 100644 --- a/lite/kernels/opencl/concat_buffer_compute.cc +++ b/lite/kernels/opencl/concat_buffer_compute.cc @@ -38,8 +38,10 @@ class ConcatCompute : public KernelLiteAddKernel( - kernel_func_name_, "buffer/concat_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/concat_kernel.cl", + build_options_, + time_stamp_); auto axis = concat_param_->axis; auto inputs = concat_param_->x; @@ -88,7 +90,7 @@ class ConcatCompute : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << 
kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto inputs = param.x; int arg_idx = 0; @@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index 95e64025662a4b87cd68c211ccc0b0fb7b84a9f2..60d1ac628ab1474d7e82f1861067bca838548569 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLiteAddKernel( - kernel_func_name_, "image/concat_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/concat_kernel.cl", + build_options_, + time_stamp_); auto axis = concat_param_->axis; auto inputs = concat_param_->x; @@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto inputs = param.x; int arg_idx = 0; @@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/conv_buffer_compute.cc b/lite/kernels/opencl/conv_buffer_compute.cc index 65477e89c7d00408bf4d639138dea936a61a3d70..4c118e1263c0d3c23eb223b01b98a8d9a53bac0e 100644 --- a/lite/kernels/opencl/conv_buffer_compute.cc +++ b/lite/kernels/opencl/conv_buffer_compute.cc @@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() { } for (size_t i = 0; i < kernel_func_names_.size(); i++) { - context.cl_context()->AddKernel( - kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); + context.cl_context()->AddKernel(kernel_func_names_[i], + kernel_func_paths_[i], + build_options_[i], + time_stamp_); } } @@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() { auto& context = ctx_->As(); std::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; + kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_; auto img2col_kernel = context.cl_context()->GetKernel(kernel_key.str()); int n_threads = c_in * h_out * w_out; @@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() { int n = h_out * w_out; VLOG(4) << "m = " << m << " n = " << n << " k = " << k; kernel_key.str(""); - kernel_key << kernel_func_names_[1] << build_options_[1]; + kernel_key << kernel_func_names_[1] << build_options_[1] << time_stamp_; auto gemm_kernel = context.cl_context()->GetKernel(kernel_key.str()); GemmBatched( gemm_kernel, col_buf, filter_buf, bias_buf, output_buf, bs, m, n, k); @@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() { auto& context = ctx_->As(); std::stringstream kernel_key; - kernel_key << kernel_func_names_.front() << build_options_.front(); + kernel_key << kernel_func_names_.front() << build_options_.front() + << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k); diff --git a/lite/kernels/opencl/conv_buffer_compute.h b/lite/kernels/opencl/conv_buffer_compute.h index 44ada55d92352edf3c64cd653e832b26718cdd2f..3dabe906f128ef96fb03dfa82ab3847febaeeed5 100644 --- a/lite/kernels/opencl/conv_buffer_compute.h +++ b/lite/kernels/opencl/conv_buffer_compute.h @@ -21,6 +21,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/tensor.h" +#include "lite/kernels/opencl/image_helper.h" 
#include "lite/operators/op_params.h" namespace paddle { @@ -55,6 +56,7 @@ class ConvCompute std::vector kernel_func_names_{}; std::vector kernel_func_paths_{}; std::vector build_options_{}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index a409690a2e37750c88395a592565b9e968e62845..aadd7010cca2ec03ea417e3b486d8c946d80fcab 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() { build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { - context.cl_context()->AddKernel( - kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); + context.cl_context()->AddKernel(kernel_func_names_[i], + kernel_func_paths_[i], + build_options_[i], + time_stamp_); } VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; std::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; + kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_; kernel_ = context.cl_context()->GetKernel(kernel_key.str()); VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); @@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; - if (max_work_group_size > 0 && use_lws) { - // local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1( - // global_work_size_, max_work_group_size); - local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_, - max_work_group_size); - + if (max_work_group_size > 0 && use_lws_) { + double min_turn_time = DBL_MAX; + cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( + global_work_size_, max_work_group_size); + cl::NDRange last_local_work_size = cl::NDRange{ + static_cast(0), static_cast(0), static_cast(0)}; + if (use_turn_) { + for (size_t i = 1; i < 15; i++) { + if (kernel_h == 1 && kernel_w == 1) { + // todo use diff logics + local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + global_work_size_, max_work_group_size, i); + } else { + local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + global_work_size_, max_work_group_size, i); + } + if (last_local_work_size[0] == local_work_size_[0] && + last_local_work_size[1] == local_work_size_[1] && + last_local_work_size[2] == local_work_size_[2]) { + // skiped turned lws + continue; + } + auto turn_time = this->Turn(5); + if (min_turn_time > turn_time) { + min_turn_time = turn_time; + best_local_work_size = local_work_size_; + } + last_local_work_size = local_work_size_; + } + } + local_work_size_ = best_local_work_size; VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," << local_work_size_[1] << "," << local_work_size_[2] << "}"; } } -void ConvImageCompute::Conv2d1x1opt() { +void ConvImageCompute::Conv2d1x1opt(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() { int input_c = input_dims[1]; auto dilations = *param.dilations; -// const std::vector& default_work_size = -// DefaultWorkSize(output_dims, -// DDim(std::vector{ -// static_cast(out_image_shape["width"]), -// static_cast(out_image_shape["height"])})); - -// int c_block = 
-void ConvImageCompute::Conv2d1x1opt() {
+void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
   auto& context = ctx_->As<OpenCLContext>();
   CHECK(context.cl_context() != nullptr);
   const auto& param = *param_.get_mutable<param_t>();
@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() {
   int input_c = input_dims[1];
   auto dilations = *param.dilations;
 
-  // const std::vector<size_t>& default_work_size =
-  //     DefaultWorkSize(output_dims,
-  //                     DDim(std::vector<DDim::value_type>{
-  //                         static_cast<int64_t>(out_image_shape["width"]),
-  //                         static_cast<int64_t>(out_image_shape["height"])}));
-
-  // int c_block = default_work_size[0];
-  // int w = default_work_size[1];
-  // int nh = default_work_size[2];
-
-  // int maped_w = maptofactor(w, 4);
-
-  // auto global_work_size_ =
-  //     cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
-  //                 static_cast<size_t>(maped_w),
-  //                 static_cast<size_t>(default_work_size.data()[2])};
-
 #ifndef LITE_SHUTDOWN_LOG
   // VLOG(4) << "out_image: " << out_image;
   VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() {
       event_.get());
   CL_CHECK_FATAL(status);
   context.cl_wait_list()->emplace(out_image, event_);
-
-#ifdef PROFILE_CONV_KERNEL
-  bool use_profile = false;
-  auto GetCurrentUS = []() -> double {
-    struct timeval time;
-    gettimeofday(&time, NULL);
-    return 1e+6 * time.tv_sec + time.tv_usec;
-  };
-  double start = GetCurrentUS();
-
-  if (use_profile) {
-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size_,
-        local_work_size_,
-        nullptr,
-        event_.get());
-    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_image, event_);
-  } else {
-    int count = 50;
-    double sumtime = 0;
-    if (!use_profile) {
-      count = 1;
-    }
-    for (size_t i = 0; i < count; i++) {
-      start = GetCurrentUS();
-      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-          kernel,
-          cl::NullRange,
-          global_work_size_,
-          local_work_size_,
-          nullptr,
-          event_.get());
-      CL_CHECK_FATAL(status);
-      context.cl_wait_list()->emplace(out_image, event_);
-      if (use_profile) {
-        event_->wait();
-        double duration = GetCurrentUS() - start;
-        sumtime += duration;
-      }
-    }
-
-    auto dims_string = [](DDimLite dims) -> std::string {
-      std::ostringstream stream;
-      stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
-             << dims[3] << "]";
-      return stream.str();
-    };
-    if (use_profile) {
-      // LOG(INFO) << "input: " << input_dims;
-      // LOG(INFO) << "filter: " << filter_dims;
-      // LOG(INFO) << "output: " << output_dims;
-
-      std::cout << std::setw(25) << std::left << dims_string(input_dims)
-                << std::setw(25) << std::left << dims_string(filter_dims)
-                << std::setw(25) << std::left << dims_string(output_dims)
-                << std::setw(25) << std::left << sumtime / count << std::endl;
-    } else {
-      dims_string(input_dims);
+  if (is_turn) {
+    event_->wait();
   }
-#endif
 }
 
-void ConvImageCompute::Conv2d3x3() {
+void ConvImageCompute::Conv2d3x3(bool is_turn) {
   auto& context = ctx_->As<OpenCLContext>();
   CHECK(context.cl_context() != nullptr);
   const auto& param = *param_.get_mutable<param_t>();
@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() {
       event_.get());
   CL_CHECK_FATAL(status);
   context.cl_wait_list()->emplace(out_image, event_);
+
+  if (is_turn) {
+    event_->wait();
+  }
 }
 
-void ConvImageCompute::Conv2d3x3opt() {
+void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
   auto& context = ctx_->As<OpenCLContext>();
   CHECK(context.cl_context() != nullptr);
   const auto& param = *param_.get_mutable<param_t>();
@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() {
       event_.get());
   CL_CHECK_FATAL(status);
   context.cl_wait_list()->emplace(out_image, event_);
+  if (is_turn) {
+    event_->wait();
+  }
 }
 
-void ConvImageCompute::Conv2d5x5() {
+void ConvImageCompute::Conv2d5x5(bool is_turn) {
   auto& context = ctx_->As<OpenCLContext>();
   CHECK(context.cl_context() != nullptr);
   const auto& param = *param_.get_mutable<param_t>();
@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() {
       event_.get());
   CL_CHECK_FATAL(status);
   context.cl_wait_list()->emplace(out_image, event_);
+  if (is_turn) {
+    event_->wait();
+  }
 }
 
-void ConvImageCompute::Conv2d5x5opt() {
+void
ConvImageCompute::Conv2d5x5opt(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() { event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); + if (is_turn) { + event_->wait(); + } } -void ConvImageCompute::Conv2d7x7() { +void ConvImageCompute::Conv2d7x7(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() { event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); + + if (is_turn) { + event_->wait(); + } } -void ConvImageCompute::Conv2d7x7opt() { +void ConvImageCompute::Conv2d7x7opt(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() { event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); + + if (is_turn) { + event_->wait(); + } } -void ConvImageCompute::DepthwiseConv2d3x3s1() { +void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(output_img, event_); + + if (is_turn) { + event_->wait(); + } } -void ConvImageCompute::DepthwiseConv2d3x3() { +void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() { event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(output_img, event_); + + if (is_turn) { + event_->wait(); + } } -void ConvImageCompute::DepthwiseConv2d() { +void ConvImageCompute::DepthwiseConv2d(bool is_turn) { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const auto& param = *param_.get_mutable(); @@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() { context.cl_wait_list()->emplace(out_image, event_); } -void ConvImageCompute::Run() { (this->*impl_)(); } -#undef PROFILE_CONV_KERNEL +void ConvImageCompute::Run() { (this->*impl_)(false); } + +double ConvImageCompute::Turn(int times) { + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + auto start = GetCurrentUS(); + for (size_t i = 0; i < times; i++) { + (this->*impl_)(true); + } + auto time_diff = (GetCurrentUS() - start) / times; + return time_diff; +} + } // namespace opencl } // namespace kernels } // namespace lite diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index c30c271498737acf3b831d7799af1b5b316e95de..6f293a0d7dd90e55bedd63c214ba38799a591080 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -22,40 +22,42 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/tensor.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace kernels { namespace opencl { - class ConvImageCompute : public KernelLite { public: using param_t = 
operators::ConvParam; - using kernel_t = void (ConvImageCompute::*)(); + using kernel_t = void (ConvImageCompute::*)(bool); void PrepareForRun() override; void Run() override; + double Turn(int times = 5); private: - void Conv2d1x1opt(); - void Conv2d3x3(); - void Conv2d3x3opt(); - void Conv2d5x5(); - void Conv2d5x5opt(); - void Conv2d7x7(); - void Conv2d7x7opt(); - void DepthwiseConv2d3x3s1(); - void DepthwiseConv2d3x3(); - void DepthwiseConv2d(); + void Conv2d1x1opt(bool is_turn = false); + void Conv2d3x3(bool is_turn = false); + void Conv2d3x3opt(bool is_turn = false); + void Conv2d5x5(bool is_turn = false); + void Conv2d5x5opt(bool is_turn = false); + void Conv2d7x7(bool is_turn = false); + void Conv2d7x7opt(bool is_turn = false); + void DepthwiseConv2d3x3s1(bool is_turn = false); + void DepthwiseConv2d3x3(bool is_turn = false); + void DepthwiseConv2d(bool is_turn = false); kernel_t impl_; std::vector kernel_func_names_{}; std::vector kernel_func_paths_{}; std::vector build_options_{}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; Tensor filter_gpu_image_; Tensor bias_gpu_image_; @@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite(1), static_cast(1), static_cast(1)}; - bool use_lws{true}; + bool use_lws_{true}; + bool use_turn_{false}; }; } // namespace opencl diff --git a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc index 0c88509926041411eddac66bea08b5d3a08d6a3c..afe2aa1c66c04d2bdf180a77362e5d6f1271c1f6 100644 --- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc @@ -44,8 +44,10 @@ class DepthwiseConv2dCompute build_options_ += " -DRELU6"; } auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/depthwise_conv2d_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/depthwise_conv2d_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -67,7 +69,7 @@ class DepthwiseConv2dCompute param.output->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -120,6 +122,7 @@ class DepthwiseConv2dCompute private: std::string kernel_func_name_{"depthwise_conv2d"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/dropout_image_compute.cc b/lite/kernels/opencl/dropout_image_compute.cc index 490e34a8868a3f625591a1c621aa297bb0639576..2be5af2ef0bf3e30d1c586d57ed6c3d40d625b14 100644 --- a/lite/kernels/opencl/dropout_image_compute.cc +++ b/lite/kernels/opencl/dropout_image_compute.cc @@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/dropout_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/dropout_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = 
context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.cc b/lite/kernels/opencl/elementwise_add_buffer_compute.cc index 3961ac7583917fdcd761614558c493e6917d3294..b70f7d1ee017566e399ac86d35df56bd4ba4d383 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc @@ -25,8 +25,10 @@ namespace opencl { void ElementwiseAddCompute::PrepareForRun() { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); UpdateParams(); } @@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() { auto* out_buf = ele_param_->Out->template mutable_data( TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); #ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(ele_param_->X->target()); diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.h b/lite/kernels/opencl/elementwise_add_buffer_compute.h index 5a9266ee69b81416d5f4dea9a3eb38aaed7b4165..7dbe5d0e8d5172386418d547812bf4e6c269f043 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.h +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.h @@ -16,6 +16,7 @@ #include #include #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -46,6 +47,7 @@ class ElementwiseAddCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_add"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index 6d0ebf638f0a8967e27a657131e1cac89967ee0b..51d488d51b72dd9af8225b45a7ee56063312d055 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -23,44 +23,84 @@ namespace lite { namespace kernels { namespace opencl { -void ElementwiseAddImageCompute::PrepareForRun() { - ele_param_ = param_.get_mutable(); - auto* x = ele_param_->X; - auto* y = ele_param_->Y; - auto axis = ele_param_->axis; +void ElementwiseAddImageCompute::PrepareForRun() {} - if (y->dims().size() == 4) { - kernel_func_name_ = "elementwise_add"; // y: ImageDefault - } else if (y->dims().size() == 1) { - if (axis == x->dims().size() - 1) { - kernel_func_name_ = "width_add"; // y: ImageDefault - } else if (axis == x->dims().size() - 3) { - kernel_func_name_ = "channel_add"; // y: ImageFolder +void ElementwiseAddImageCompute::ReInitWhenNeeded() { + ele_param_ = param_.get_mutable(); + auto x_dims = ele_param_->X->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // choose kernel + auto* x = ele_param_->X; + auto* y = ele_param_->Y; + auto* out = ele_param_->Out; + auto axis = ele_param_->axis; + + if (y->dims().size() == 4) { + kernel_func_name_ = "elementwise_add"; // 
y: ImageDefault + } else if (y->dims().size() == 1) { + if (axis == x->dims().size() - 1) { + kernel_func_name_ = "width_add"; // y: ImageDefault + } else if (axis == x->dims().size() - 3) { + kernel_func_name_ = "channel_add"; // y: ImageFolder + } else { + LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis + << ", x->dims().size():" << x->dims().size() + << ", y->dims.size():" << y->dims().size(); + } } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis << ", x->dims().size():" << x->dims().size() << ", y->dims.size():" << y->dims().size(); } - } else { - LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + + auto& context = ctx_->As(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_add_kernel.cl", + build_options_, + time_stamp_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims()); // w, h + y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims()); + out_img_shape_ = + default_convertor.InitImageDimInfoWith(out->dims()); // w, h + + // compute global work size + GetGlobalWorkSize(); } - VLOG(1) << "kernel_func_name_:" << kernel_func_name_; +} - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); +void ElementwiseAddImageCompute::GetGlobalWorkSize() { + global_work_size_ = cl::NDRange{static_cast(x_img_shape_[0]), + static_cast(x_img_shape_[1])}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " " + << x_img_shape_[1]; +#endif } void ElementwiseAddImageCompute::Run() { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x = ele_param_->X; auto* y = ele_param_->Y; auto* out = ele_param_->Out; auto axis = ele_param_->axis; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + auto* x_img = x->data(); + auto* y_img = y->data(); + auto* out_img = out->mutable_data(out_img_shape_[0], + out_img_shape_[1]); #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); @@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; -#endif - - paddle::lite::CLImageConverterDefault default_convertor; - auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h - auto x_img_width = x_img_shape[0]; - auto x_img_height = x_img_shape[1]; - auto out_img_shape = - default_convertor.InitImageDimInfoWith(out->dims()); // w, h - auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims()); - auto* x_img = x->data(); - auto* y_img = y->data(); - auto* out_img = out->mutable_data(out_img_shape[0], - out_img_shape[1]); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; - VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; - VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " - << out_img_shape[1]; + VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1]; + VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " 
<< y_img_shape_[1]; + VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " " + << out_img_shape_[1]; #endif - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int arg_idx = 0; - auto y_dims = y->dims(); + cl_int status; + auto kernel = kernel_; if (y_dims.size() == 4) { - cl_int status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *y_img); + status = kernel.setArg(1, *y_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); } else if (y_dims.size() == 1) { - if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { - int tensor_w = x->dims()[x->dims().size() - 1]; + if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) { + const int tensor_w = x_dims[x_dims.size() - 1]; #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; #endif - cl_int status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *y_img); + status = kernel.setArg(1, *y_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(tensor_w)); + status = kernel.setArg(3, tensor_w); CL_CHECK_FATAL(status); } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + << ", x->dims().size():" << x_dims.size() + << ", y->dims.size():" << y_dims.size(); } } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + << ", x->dims().size():" << x_dims.size() + << ", y->dims.size():" << y_dims.size(); } - auto global_work_size = cl::NDRange{static_cast(x_img_width), - static_cast(x_img_height)}; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; -#endif - auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); diff --git a/lite/kernels/opencl/elementwise_add_image_compute.h b/lite/kernels/opencl/elementwise_add_image_compute.h index 084f0fe7fb18f9abe3c6ef41f10a9e38e31a54fc..a92a1b448176628381a3c65b838f6bba529eb4e0 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.h +++ b/lite/kernels/opencl/elementwise_add_image_compute.h @@ -15,8 +15,10 @@ #include #include +#include #include "lite/backends/opencl/cl_half.h" #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -34,6 +36,10 @@ class ElementwiseAddImageCompute void PrepareForRun() override; + void ReInitWhenNeeded() override; + + void GetGlobalWorkSize(); + void Run() override; std::string doc() const override { @@ -42,8 +48,21 @@ class ElementwiseAddImageCompute protected: param_t* ele_param_{nullptr}; + DDim last_x_dims_; + DDim x_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim y_img_shape_ = DDim(std::vector( + 
{static_cast(1), static_cast(1)})); + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + std::string kernel_func_name_{"elementwise_add"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; + bool first_epoch_for_reinit_{true}; + cl::Kernel kernel_; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index aa6af2a29bfdedfb5fdd3114693514b6fad13a64..96dc2de1affba7c36be6c9c0e952b85be726fca8 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -71,8 +71,10 @@ class ElementwiseMulImageCompute VLOG(4) << "bias_dims.size():" << bias_dims.size(); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_mul_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -114,7 +116,7 @@ class ElementwiseMulImageCompute #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto bias_dims = y->dims(); @@ -201,6 +203,7 @@ class ElementwiseMulImageCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_mul"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 0bc867d7f124582660b7a0a9a95d026d910fc2d3..b93167b99c064a2f9eb2256291adad99f3912baf 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() { VLOG(1) << "kernel_func_name_:" << kernel_func_name_; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_sub_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_sub_kernel.cl", + build_options_, + time_stamp_); } void ElementwiseSubImageCompute::Run() { @@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() { #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.h b/lite/kernels/opencl/elementwise_sub_image_compute.h index 48386b083e5375f8943c04afb1da70a2ed207dbf..db3e1db9813bffd985a41abbac14e5c89e574397 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.h +++ b/lite/kernels/opencl/elementwise_sub_image_compute.h @@ -17,6 +17,7 @@ #include #include "lite/backends/opencl/cl_half.h" #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -44,6 +45,7 @@ class ElementwiseSubImageCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_sub"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string 
time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index dbdedd136ea6b8c6b06d02d4f6d893e4ea849e8a..0fb83db2fe76e27baf7a096395369cb92b995072 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" @@ -30,74 +31,98 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - const auto x_dims = param.input->dims(); - const auto w_dims = param.w->dims(); - - CHECK_GE(x_dims.size(), 2UL); - CHECK_GE(w_dims.size(), 2UL); - CHECK_EQ(param.output->dims().size(), 2UL); - - m_ = x_dims.Slice(0, param.in_num_col_dims).production(); - k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); - n_ = w_dims[1]; - CHECK_EQ(k_, static_cast(w_dims[0])); - VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2] - << " " << w_dims[3]; - VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_; + void PrepareForRun() override {} + void ReInitWhenNeeded() override { + fc_param_ = param_.get_mutable(); + const auto x_dims = fc_param_->input->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute m,n,k + const auto w_dims = fc_param_->w->dims(); + CHECK_GE(x_dims.size(), 2UL); + CHECK_GE(w_dims.size(), 2UL); + CHECK_EQ(fc_param_->output->dims().size(), 2UL); + + m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production(); + k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production(); + n_ = w_dims[1]; + CHECK_EQ(k_, static_cast(w_dims[0])); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] + << " " << x_dims[3]; + VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2] + << " " << w_dims[3]; + VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_; +#endif + + // choose kernel + if (m_ == 1) { // gemv + kernel_func_name_ = "fc_gemv_1x4"; + } else { // gemm + kernel_func_name_ = "fc_gemm_4x4"; + } +#ifndef LITE_SHUTDOWN_LOG + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; +#endif + + if (fc_param_->activation_type == "relu") { + build_options_ += "-DRELU"; + } + + auto& context = ctx_->As(); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/fc_kernel.cl", + build_options_, + time_stamp_); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + + // compute global work size + GetGlobalWorkSize(); + } + } + + void GetGlobalWorkSize() { if (m_ == 1) { // gemv - kernel_func_name_ = "fc_gemv_1x4"; global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm - kernel_func_name_ = "fc_gemm_4x4"; global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), static_cast((n_ + 3) / 4)}; } - VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - - if (param.activation_type == "relu") { - build_options_ += "-DRELU"; - } - auto& 
context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/fc_kernel.cl", build_options_); } void Run() override { - const auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x_buf = param.input->data(); - auto* w_buf = param.w->data(); - auto* bias_buf = param.bias->data(); + auto* x_buf = fc_param_->input->data(); + auto* w_buf = fc_param_->w->data(); + auto* bias_buf = fc_param_->bias->data(); auto* out_buf = - param.output->mutable_data(TARGET(kOpenCL)); - - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + fc_param_->output->mutable_data(TARGET(kOpenCL)); + auto kernel = kernel_; cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, *x_buf); + status = kernel.setArg(0, *x_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *w_buf); + status = kernel.setArg(1, *w_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *bias_buf); + status = kernel.setArg(2, *bias_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); + status = kernel.setArg(3, *out_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(m_)); + status = kernel.setArg(4, static_cast(m_)); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(n_)); + status = kernel.setArg(5, static_cast(n_)); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(k_)); + status = kernel.setArg(6, static_cast(k_)); CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, @@ -111,9 +136,14 @@ class FcCompute private: int m_, n_, k_; + param_t* fc_param_{nullptr}; std::string kernel_func_name_{}; std::string build_options_{"-DCL_DTYPE_float "}; + std::string time_stamp_{GetTimeStamp()}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; cl::NDRange global_work_size_; + cl::Kernel kernel_; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc b/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc index d76e00fa85d4ebb6da9d779e9c2b220a2fd731d9..730b70525e818512aea11e1f42c1282b125aae54 100644 --- a/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc +++ b/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc @@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute { void PrepareForRun() override { build_options_ += " -DRELU"; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); UpdateParams(); auto act_t = static_cast(ele_param_)->act_type; diff --git a/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc b/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc index e5c0e29bddf5cd6c25ccf98f05aa7cb091a4be7e..8e687340943dcb0f1b68e4c9495cbab1ad703645 100644 --- a/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc +++ b/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc @@ -16,6 +16,7 @@ #include 
"lite/backends/opencl/cl_include.h" #include "lite/core/op_registry.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h" +#include "lite/kernels/opencl/image_helper.h" namespace paddle { namespace lite { @@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute void PrepareForRun() override { build_options_ += " -DRELU"; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); auto act_t = static_cast(ele_param_)->act_type; VLOG(4) << "act: " << act_t; diff --git a/lite/kernels/opencl/grid_sampler_image_compute.cc b/lite/kernels/opencl/grid_sampler_image_compute.cc index 243737a81331a7159834d30ccfb2fab181baeebe..4fb13a61181ba282f7005ea158768ee18b94b7a0 100644 --- a/lite/kernels/opencl/grid_sampler_image_compute.cc +++ b/lite/kernels/opencl/grid_sampler_image_compute.cc @@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLiteAs(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/grid_sampler_kernel.cl", + build_options_, + time_stamp_); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + } + + void ReInitWhenNeeded() override { grid_param_ = param_.get_mutable(); + auto x_dims = grid_param_->x->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + out_img_shape_ = + default_convertor.InitImageDimInfoWith(grid_param_->out->dims()); + + // compute global work size + GetGlobalWorkSize(); + } + } - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_); - VLOG(4) << "kernel_func_name_:" << kernel_func_name_; + void GetGlobalWorkSize() { + auto default_work_size = + DefaultWorkSize(grid_param_->out->dims(), + DDim(std::vector{ + static_cast(out_img_shape_[0]), + static_cast(out_img_shape_[1])})); + global_work_size_ = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2] / 4)}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "default_work_size: " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; + VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " " + << global_work_size_[1] << " " << global_work_size_[2]; +#endif } void Run() override { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x = grid_param_->x; - auto* out = grid_param_->out; auto* grid = grid_param_->grid; + auto* out = grid_param_->out; + auto out_dims = out->dims(); - auto in_dims = x->dims(); + int out_height = out_dims[2]; + int out_width = out_dims[3]; + + auto* x_img = x->data(); + auto* grid_img = x->data(); + auto* out_img = out->mutable_data(out_img_shape_[0], + out_img_shape_[1]); #ifndef LITE_SHUTDOWN_LOG + auto in_dims = x->dims(); VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << 
out_dims; -#endif - - auto out_image_shape = InitImageDimInfoWith(out_dims); - auto* x_img = x->data(); // VLOG(4) << "x_image: " << x_img; - - auto* grid_img = x->data(); // VLOG(4) << "grid_img: " << grid_img; - - auto* out_img = out->mutable_data( - out_image_shape["width"], out_image_shape["height"]); -#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; - VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " - << out_image_shape["height"]; + VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " " + << out_img_shape_[1]; #endif - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int arg_idx = 0; - int out_height = out_dims[2]; - int out_width = out_dims[3]; - auto default_work_size = - DefaultWorkSize(out_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "default_work_size: " << default_work_size[0] << ", " - << default_work_size[1] << ", " << default_work_size[2]; -#endif - cl_int status = kernel.setArg(arg_idx++, *x_img); + cl_int status; + auto kernel = kernel_; + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, *grid_img); + status = kernel.setArg(1, *grid_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, out_height); + status = kernel.setArg(3, out_height); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, out_width); + status = kernel.setArg(4, out_width); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size[0]), - static_cast(default_work_size[1]), - static_cast(default_work_size[2] / 4)}; - + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " - << global_work_size[1] << " " << global_work_size[2]; -#endif } protected: param_t* grid_param_{nullptr}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); std::string kernel_func_name_{"grid_sampler"}; + cl::Kernel kernel_; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/image_helper.h b/lite/kernels/opencl/image_helper.h index d0d282250d1c5658bc8f684b52b4b0d140895833..81d38bc683eb355b1d85a307d35839b4e3e8ef45 100644 --- a/lite/kernels/opencl/image_helper.h +++ b/lite/kernels/opencl/image_helper.h @@ -74,6 +74,12 @@ static std::vector DefaultWorkSize(const DDim& image_dim, LOG(FATAL) << " not support this dim, need imp "; } +static const std::string GetTimeStamp() { + struct timeval time; + gettimeofday(&time, NULL); + return std::to_string(time.tv_usec); +} + } // namespace opencl } // namespace kernels } // namespace lite diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc 
b/lite/kernels/opencl/instance_norm_image_compute.cc index 6bdec0ca6cdfd16219becf704de4d5701aad3197..c5e02ae0ed4ae9facf36747d99ee825e6eab6515 100644 --- a/lite/kernels/opencl/instance_norm_image_compute.cc +++ b/lite/kernels/opencl/instance_norm_image_compute.cc @@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLiteAs(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/instance_norm_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLiteGetKernel(kernel_key.str()); cl_int status = kernel.setArg(0, out_w); @@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite( scale_img_size[0], scale_img_size[1], bias_img.data()); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/instance_norm_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLiteGetKernel(kernel_key.str()); auto* scale_img = scale_image_.data(); auto* bias_img = bias_image_.data(); @@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite event_{new cl::Event}; Tensor scale_image_; Tensor bias_image_; diff --git a/lite/kernels/opencl/lrn_image_compute.cc b/lite/kernels/opencl/lrn_image_compute.cc index edce0368ddc9cda54fdab44b472fcd0e771413ae..0e01bdc107c4fcb4a0caf943cfb1b768557dd671 100644 --- a/lite/kernels/opencl/lrn_image_compute.cc +++ b/lite/kernels/opencl/lrn_image_compute.cc @@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLitebeta; norm_region_ = lrn_param_->norm_region; context.cl_context()->AddKernel( - kernel_func_name_, "image/lrn_kernel.cl", build_options_); + kernel_func_name_, "image/lrn_kernel.cl", build_options_, time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLiteGetKernel(kernel_key.str()); int arg_idx = 0; @@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/mul_buffer_compute.cc b/lite/kernels/opencl/mul_buffer_compute.cc index 4c46da67da9877fb37b214b6d738b3dd3da3e5bb..e8edb359898fb47cf47919a25e521ca9f8353104 100644 --- a/lite/kernels/opencl/mul_buffer_compute.cc +++ b/lite/kernels/opencl/mul_buffer_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" @@ -32,8 +33,10 @@ class MulCompute void PrepareForRun() override { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/mat_mul_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/mat_mul_kernel.cl", + build_options_, + time_stamp_); const auto& param = *param_.get_mutable(); const auto* x_data = param.x->data(); const auto* y_data = param.y->data(); @@ -68,7 +71,7 @@ class MulCompute param.output->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << 
time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -103,6 +106,7 @@ class MulCompute int m_, n_, k_; std::string kernel_func_name_{"mat_mul"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/nearest_interp_image_compute.cc b/lite/kernels/opencl/nearest_interp_image_compute.cc index 082f21ab1ae792ae33e9e2a368073274258b8884..17637e2569556d1eeb8b6002c0073223345ac7ec 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute.cc @@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault void PrepareForRun() override { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/nearest_interp_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; @@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault private: std::string kernel_func_name_{"nearest_interp"}; std::string build_options_{" -DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pad2d_image_compute.cc b/lite/kernels/opencl/pad2d_image_compute.cc index 1be4729ee1b24ac77383de4d7c111e9d37d29d6b..f16642d449d29c2afd3db7097432945c73d107e3 100644 --- a/lite/kernels/opencl/pad2d_image_compute.cc +++ b/lite/kernels/opencl/pad2d_image_compute.cc @@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLiteAs(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/pad2d_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/pad2d_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLiteGetKernel(kernel_key.str()); int arg_idx = 0; @@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pool_buffer_compute.cc b/lite/kernels/opencl/pool_buffer_compute.cc index 3f491afb86d4e4d5144522b6fb028c225c9a97e4..aeba4bcd2ea1d9b1f14ac86509ab9dbec2509ad0 100644 --- a/lite/kernels/opencl/pool_buffer_compute.cc +++ b/lite/kernels/opencl/pool_buffer_compute.cc @@ -37,8 +37,10 @@ class PoolCompute const auto& param = *param_.get_mutable(); kernel_func_name_ += param.pooling_type; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/pool_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/pool_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -69,7 +71,7 @@ class PoolCompute auto* output_buf = param.output->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; auto numel = out_dims.production(); @@ -117,6 +119,7 @@ class 
PoolCompute private: std::string kernel_func_name_{"pool_"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index 39da325ebb10c85f153e349173aa833bbf5e1f6e..34524122c8e475df63db02eae32b7d100abfa2d9 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLiteAs(); context.cl_context()->AddKernel( - kernel_func_name_, "image/pool_kernel.cl", build_options_); + kernel_func_name_, "image/pool_kernel.cl", build_options_, time_stamp_); } void Run() override { @@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLiteGetKernel(kernel_key.str()); int c_block = (out_dims[1] + 3) / 4; @@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/reshape_image_compute.cc b/lite/kernels/opencl/reshape_image_compute.cc index 376add226216a57a0868c9c52497b784929a207e..febb1c33d9c4df2cb58580a03bda1eff93ed4da7 100644 --- a/lite/kernels/opencl/reshape_image_compute.cc +++ b/lite/kernels/opencl/reshape_image_compute.cc @@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/reshape_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/reshape_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); #ifndef LITE_SHUTDOWN_LOG @@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/scale_image_compute.cc b/lite/kernels/opencl/scale_image_compute.cc index 5fd9a2b46b5ce3b0ad84449785f510d5f0391250..97b56e68d47fcdf1647433f5e267c264fb36c5c2 100644 --- a/lite/kernels/opencl/scale_image_compute.cc +++ b/lite/kernels/opencl/scale_image_compute.cc @@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLiteAs(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/scale_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/scale_kernel.cl", build_options_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + } + + void ReInitWhenNeeded() override { + scale_param_ = param_.get_mutable(); + auto x_dims = scale_param_->x->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + out_img_shape_ = + default_convertor.InitImageDimInfoWith(scale_param_->output->dims()); + + // compute global work size + GetGlobalWorkSize(); + } + } + + void GetGlobalWorkSize() { + global_work_size_ = + cl::NDRange{static_cast(out_img_shape_[0]), + static_cast(out_img_shape_[1])}; } void Run() 
override { - const auto& param = *param_.get_mutable(); - const auto& in_dims = param.x->dims(); - auto* x_img = param.x->data(); - const float scale = param.scale; - const float bias = param.bias; - - // LOG(INFO) << "x_image" << x_img; - auto out_image_shape = InitImageDimInfoWith(in_dims); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " " - << out_image_shape["height"]; -#endif - auto* out_img = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - // LOG(INFO) << "out_image" << out_img; + auto* x_img = scale_param_->x->data(); + auto* out_img = scale_param_->output->mutable_data( + out_img_shape_[0], out_img_shape_[1]); + const float scale = scale_param_->scale; + const float bias = scale_param_->bias; auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - auto global_work_size = - cl::NDRange{static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])}; + auto kernel = kernel_; cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(1, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, scale); + status = kernel.setArg(2, scale); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, bias); + status = kernel.setArg(3, bias); CL_CHECK_FATAL(status); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite event_{new cl::Event}; + + param_t* scale_param_{nullptr}; + cl::Kernel kernel_; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; }; } // namespace opencl diff --git a/lite/kernels/opencl/slice_image_compute.cc b/lite/kernels/opencl/slice_image_compute.cc index 149ef35afe3d49ca8793769ee7ad366292462296..dd231ec8647ba88ab0f953661af47bc36c948e8b 100644 --- a/lite/kernels/opencl/slice_image_compute.cc +++ b/lite/kernels/opencl/slice_image_compute.cc @@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/slice_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/slice_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt index 29cb83b2b853d4953bfbe7faca8633f2789e1d50..93f3cdb445af7b75adc76294b287d9963f4e3cca 100644 --- a/lite/kernels/xpu/bridges/CMakeLists.txt +++ 
b/lite/kernels/xpu/bridges/CMakeLists.txt @@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x lite_cc_library(subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_yolo_box_op_xpu SRCS yolo_box_op.cc DEPS ${xpu_subgraph_bridge_deps}) set(xpu_subgraph_bridges subgraph_bridge_registry @@ -48,6 +49,7 @@ set(xpu_subgraph_bridges subgraph_bridge_dropout_op_xpu subgraph_bridge_matmul_op_xpu subgraph_bridge_cast_op_xpu + subgraph_bridge_yolo_box_op_xpu CACHE INTERNAL "xpu_subgraph_bridges") message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}") diff --git a/lite/kernels/xpu/bridges/paddle_use_bridges.h b/lite/kernels/xpu/bridges/paddle_use_bridges.h index 0c7886c5b2b431db7ba97d8557fb6a49750bd468..cf896426f7a40ae17cd73547071f86dcfa738839 100644 --- a/lite/kernels/xpu/bridges/paddle_use_bridges.h +++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h @@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU); USE_SUBGRAPH_BRIDGE(dropout, kXPU); USE_SUBGRAPH_BRIDGE(matmul, kXPU); USE_SUBGRAPH_BRIDGE(cast, kXPU); +USE_SUBGRAPH_BRIDGE(yolo_box, kXPU); diff --git a/lite/kernels/xpu/bridges/yolo_box_op.cc b/lite/kernels/xpu/bridges/yolo_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1b7c014702aa8530d5b502bb6d32825e7bb13b2 --- /dev/null +++ b/lite/kernels/xpu/bridges/yolo_box_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
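Throughout the OpenCL kernel changes above, AddKernel() gains a fourth time_stamp_ argument and the same stamp is appended to the kernel_key handed to GetKernel(), so each kernel instance compiled in PrepareForRun() gets a cache entry that cannot collide with another instance built from the same function name and build options. A minimal self-contained sketch of the keying scheme, using std::map and std::stringstream in place of CLContext's kernel cache and STL::stringstream (only GetTimeStamp() follows image_helper.h; everything else here is illustrative):

#include <sys/time.h>
#include <map>
#include <sstream>
#include <string>

// Same scheme as lite/kernels/opencl/image_helper.h: one stamp per kernel
// object, captured at construction time from the microsecond clock.
static const std::string GetTimeStamp() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return std::to_string(time.tv_usec);
}

int main() {
  // Stand-in for CLContext's compiled-program cache.
  std::map<std::string, std::string> kernel_cache;
  const std::string time_stamp = GetTimeStamp();

  // Key layout "<func_name><build_options><time_stamp>", as built in the
  // PrepareForRun()/Run() bodies above.
  std::stringstream kernel_key;
  kernel_key << "grid_sampler" << "-DCL_DTYPE_half" << time_stamp;

  kernel_cache[kernel_key.str()] = "compiled cl::Kernel";  // AddKernel()
  return kernel_cache.count(kernel_key.str()) ? 0 : 1;     // GetKernel()
}

Since the stamp is only tv_usec, two kernel objects constructed in the same microsecond (or exactly one second apart) could in principle still share a key; the scheme assumes that is unlikely in practice.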
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int YoloBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_name); + + auto img_size_name = op_info->Input("ImgSize").front(); + auto img_size = scope->FindTensor(img_size_name); + + auto boxes_name = op_info->Output("Boxes").front(); + auto scores_name = op_info->Output("Scores").front(); + + auto anchors = op_info->GetAttr>("anchors"); + auto class_num = op_info->GetAttr("class_num"); + auto conf_thresh = op_info->GetAttr("conf_thresh"); + auto downsample_ratio = op_info->GetAttr("downsample_ratio"); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // ImgSize node + std::shared_ptr img_size_node = nullptr; + if (graph->Has(img_size_name)) { + img_size_node = graph->Get(img_size_name); + } else { + img_size_node = graph->Add(img_size_name, *img_size); + } + + // YoloBox node + auto yolo_box_data = + graph->builder_.CreateYoloBox(*x_node->data(), + *img_size_node->data(), + CvtShape(anchors), + class_num, + conf_thresh, + downsample_ratio); + graph->Add(boxes_name, graph->builder_.GetField(yolo_box_data, 0)); + graph->Add(scores_name, graph->builder_.GetField(yolo_box_data, 1)); + + return SUCCESS; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(yolo_box, + kXPU, + paddle::lite::subgraph::xpu::YoloBoxConverter); diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 1b6d374f7396cf1e4e91bfe786603005fb0ff8dc..9c2191331c85a7f99ffb5a2e9662ed5831cb1dda 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() { subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); for (auto& inst : origin_program_) { - auto op = inst.op(); + auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); @@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= - bridges.Select(op_type, TARGET(kXPU))(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, TARGET(kXPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 08e6a303094dc42278bfcb24c54f16bd3819d5c1..7f938577c3b2e53257d4fb79686a0bf8c6a67ad5 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { pb_dims->Resize(static_cast(dims.size()), 0); auto dims_vec = dims.Vectorize(); std::copy(dims_vec.begin(), dims_vec.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); + int32_t size = desc.ByteSizeLong();
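The hunk above swaps desc.ByteSize() for desc.ByteSizeLong() in TensorToStream(): in protobuf, ByteSize() returns int and is deprecated, while ByteSizeLong() returns size_t and stays correct for messages larger than 2 GB. A sketch of the surrounding write path, hedged to a generic google::protobuf::Message rather than the concrete tensor descriptor type used in model_parser.cc:

#include <google/protobuf/message.h>
#include <cstdint>
#include <ostream>

// Writes "<int32 length><serialized bytes>", as TensorToStream() does.
// Note the length prefix itself is still int32_t, so the stream format
// keeps a 2 GB ceiling even though ByteSizeLong() is 64-bit clean.
void DescToStream(std::ostream& os, const google::protobuf::Message& desc) {
  int32_t size = desc.ByteSizeLong();
  os.write(reinterpret_cast<const char*>(&size), sizeof(size));
  auto out = desc.SerializeAsString();
  os.write(out.data(), size);
}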
os.write(reinterpret_cast(&size), sizeof(size)); auto out = desc.SerializeAsString(); os.write(out.data(), size); diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 48e27560317c089446e8dbc5040786f34ca962c4..ae9ec3ad47fbc00c91ba06c1597bd65e510b629b 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) # 4. training op add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) -if (LITE_WITH_TRAIN) - add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS}) - add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS}) - add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS}) - add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS}) - add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS}) -endif() + +add_operator(mean_grad_op train SRCS mean_grad_op.cc DEPS ${op_DEPS}) +add_operator(activation_grad_ops train SRCS activation_grad_ops.cc DEPS ${op_DEPS}) +add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DEPS}) +add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS}) +add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index abaaa1a705c2c6995a7d846c1d9add0dab98867b..13abe0c53e95363e7f54c56819eaac26ef720072 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -71,6 +71,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { } else if (opdesc.Type() == "exp") { // exp param_.active_type = lite_api::ActivationType::kExp; + } else if (opdesc.Type() == "abs") { + // abs + param_.active_type = lite_api::ActivationType::kAbs; } VLOG(4) << "opdesc.Type():" << opdesc.Type(); @@ -92,6 +95,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/multiclass_nms_op.cc b/lite/operators/multiclass_nms_op.cc index 3102030e4b2cdc40eae369d2a43e9b94287e1873..8e47a828c13f3ca73f033cd0422f8c05be857cbe 100644 --- a/lite/operators/multiclass_nms_op.cc +++ b/lite/operators/multiclass_nms_op.cc @@ -42,14 +42,8 @@ bool MulticlassNmsOpLite::CheckShape() const { } bool MulticlassNmsOpLite::InferShapeImpl() const { - auto box_dims = param_.bboxes->dims(); - auto score_dims = param_.scores->dims(); - auto score_size = score_dims.size(); - if (score_size == 3) { - param_.out->Resize({box_dims[1], box_dims[2], 3}); - } else { - param_.out->Resize({-1, box_dims[2] + 2}); - } + // InferShape is useless for multiclass_nms + // out's dim is not sure before the end of calculation return true; } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 1e221a602a426f3f117c69b9525f2a1d85880ee0..3fdca389bca1ba09ebfe008365b6992b717270d8 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -158,6 +158,21 @@ struct MulParam : ParamBase { int 
y_num_col_dims{1}; // for int8 WITH_INT8_CONFIG + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x, y})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; struct MulGradParam : ParamBase { @@ -226,6 +241,21 @@ struct ScaleParam : ParamBase { float scale{1.}; float bias{}; bool bias_after_scale{true}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; // For Softmax op @@ -260,6 +290,21 @@ struct ReshapeParam : ParamBase { lite::Tensor* xshape{}; bool inplace{false}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; // For Concat op @@ -268,6 +313,24 @@ struct ConcatParam : ParamBase { lite::Tensor* output{}; int axis{0}; lite::Tensor* axis_tensor{}; + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + std::vector vec; + for (auto in : x) { + vec.push_back(in); + } + input_tensor_ptrs_cache_.reset(new std::vector(vec)); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; /// ----------------------- activation operators ---------------------- @@ -370,6 +433,21 @@ struct BatchNormParam : ParamBase { float epsilon; float momentum; DataLayoutType data_layout{DATALAYOUT(kNCHW)}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({y})); + } + return output_tensor_ptrs_cache_.get(); + } }; // For Pooling op @@ -394,6 +472,21 @@ struct PoolParam : ParamBase { std::string data_format{"AnyLayout"}; // for int8 WITH_INT8_CONFIG + 
/////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; // For Dropout op @@ -418,6 +511,21 @@ struct SplitParam : ParamBase { int axis{-1}; int num{0}; std::vector sections; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; // For Transpose op @@ -429,6 +537,21 @@ struct TransposeParam : ParamBase { std::vector axis; bool use_mkldnn{false}; std::string data_format{"AnyLayout"}; + /////////////////////////////////////////////////////////////////////////////////// + // // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({x})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } }; /// ----------------------- element wise operators ---------------------- @@ -754,6 +877,21 @@ struct Im2SequenceParam : ParamBase { struct SequenceSoftmaxParam : ParamBase { const lite::Tensor* X{}; lite::Tensor* Out{}; + /////////////////////////////////////////////////////////////////////////////////// + // // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({X})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({Out})); + } + return output_tensor_ptrs_cache_.get(); + } }; struct NormParam : ParamBase { @@ -984,6 +1122,21 @@ struct SliceParam : ParamBase { std::vector EndsTensorList{}; lite::Tensor* StartsTensor{nullptr}; lite::Tensor* EndsTensor{nullptr}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({X})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({Out})); + } + return output_tensor_ptrs_cache_.get(); + } }; struct AffineChannelParam : ParamBase { @@ -1031,6 +1184,21 @@ struct SqueezeParam : ParamBase { 
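Every ParamBase hunk in op_params.h above, and the SqueezeParam/UnsqueezeParam/MatMulParam hunks that follow, repeat one pattern: a lazily built std::vector of raw tensor pointers, so framework passes can enumerate a param's inputs and outputs without knowing the concrete param type. As written, though, the guard if (UNLIKELY(input_tensor_ptrs_cache_)) rebuilds only a cache that already exists and returns a null pointer on the first call; the negated test is presumably what is intended. A minimal sketch of that presumed intent (Tensor, UNLIKELY, and the param struct are simplified stand-ins for the real lite types, and the cache members are declared locally rather than in ParamBase):

#include <memory>
#include <vector>

struct Tensor {};  // stand-in for lite::Tensor
#define UNLIKELY(x) __builtin_expect(!!(x), 0)  // GCC/Clang builtin

struct MulParamSketch {
  const Tensor* x{};
  const Tensor* y{};
  Tensor* output{};

  // First call populates the cache, later calls reuse it. Note the test
  // is negated relative to the diff above.
  const std::vector<const Tensor*>* input_tensor_ptrs() {
    if (UNLIKELY(!input_tensor_ptrs_cache_)) {
      input_tensor_ptrs_cache_.reset(new std::vector<const Tensor*>({x, y}));
    }
    return input_tensor_ptrs_cache_.get();
  }

  const std::vector<Tensor*>* output_tensor_ptrs() {
    if (UNLIKELY(!output_tensor_ptrs_cache_)) {
      output_tensor_ptrs_cache_.reset(new std::vector<Tensor*>({output}));
    }
    return output_tensor_ptrs_cache_.get();
  }

 private:
  std::unique_ptr<std::vector<const Tensor*>> input_tensor_ptrs_cache_;
  std::unique_ptr<std::vector<Tensor*>> output_tensor_ptrs_cache_;
};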
lite::Tensor* Out{}; lite::Tensor* XShape{}; std::vector axes{}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({X})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({Out})); + } + return output_tensor_ptrs_cache_.get(); + } }; struct UnsqueezeParam : ParamBase { @@ -1040,6 +1208,21 @@ struct UnsqueezeParam : ParamBase { std::vector axes{}; const lite::Tensor* axes_tensor{}; std::vector axes_tensor_vct{}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({X})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({Out})); + } + return output_tensor_ptrs_cache_.get(); + } }; /// ----------------------- expand operators ---------------------- @@ -1057,6 +1240,21 @@ struct MatMulParam : ParamBase { bool transpose_X{false}; bool transpose_Y{false}; float alpha{1.0f}; + /////////////////////////////////////////////////////////////////////////////////// + // get a vector of input tensors + const std::vector* input_tensor_ptrs() { + if (UNLIKELY(input_tensor_ptrs_cache_)) { + input_tensor_ptrs_cache_.reset(new std::vector({X, Y})); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + const std::vector* output_tensor_ptrs() { + if (UNLIKELY(output_tensor_ptrs_cache_)) { + output_tensor_ptrs_cache_.reset(new std::vector({Out})); + } + return output_tensor_ptrs_cache_.get(); + } }; struct GatherParam : ParamBase { diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index ffed48cdc612bd7d5c7e701b0e198390976b7bef..e108e35af76c6b5f2c5719b650b06d849a2f3887 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -32,6 +32,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_ lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS 
arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a1190197bffdf505fec77c6b22b7871316a2d125 --- /dev/null +++ b/lite/tests/kernels/multiclass_nms_compute_test.cc @@ -0,0 +1,491 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static void GetMaxScoreIndex(const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) 
: static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void SliceOneClass(const Tensor& items, + const int class_id, + Tensor* one_class_item) { + T* item_data = one_class_item->mutable_data(); + const T* items_data = items.data(); + const int64_t num_item = items.dims()[0]; + const int64_t class_num = items.dims()[1]; + if (items.dims().size() == 3) { + int64_t item_size = items.dims()[2]; + for (int i = 0; i < num_item; ++i) { + std::memcpy(item_data + i * item_size, + items_data + i * class_num * item_size + class_id * item_size, + sizeof(T) * item_size); + } + } else { + for (int i = 0; i < num_item; ++i) { + item_data[i] = items_data[i * class_num + class_id]; + } + } +} + +template +void NMSFast(const Tensor& bbox, + const Tensor& scores, + const T score_threshold, + const T nms_threshold, + const T eta, + const int64_t top_k, + std::vector* selected_indices, + const bool normalized) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] + // 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16 + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + // 4: [xmin ymin xmax ymax] + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + normalized); + } else { + LOG(FATAL) << "not support"; + } + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const Tensor& scores, + const Tensor& bboxes, + const int scores_size, + std::map>* indices, + int* num_nmsed_out, + int64_t background_label, + int64_t nms_top_k, + int64_t keep_top_k, + bool normalized, + T nms_threshold, + T nms_eta, + T score_threshold) { + int num_det = 0; + + int64_t class_num = scores_size == 3 ? 
scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + if (scores_size == 3) { + score_slice = scores.Slice(c, c + 1); + bbox_slice = bboxes; + } else { + score_slice.Resize({scores.dims()[0], 1}); + bbox_slice.Resize({scores.dims()[0], 4}); + SliceOneClass(scores, c, &score_slice); + SliceOneClass(bboxes, c, &bbox_slice); + } + NMSFast(bbox_slice, + score_slice, + score_threshold, + nms_threshold, + nms_eta, + nms_top_k, + &((*indices)[c]), + normalized); + if (scores_size == 2) { + std::stable_sort((*indices)[c].begin(), (*indices)[c].end()); + } + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + const T* sdata; + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + if (scores_size == 3) { + sdata = scores_data + label * scores.dims()[1]; + } else { + score_slice.Resize({scores.dims()[0], 1}); + SliceOneClass(scores, label, &score_slice); + sdata = score_slice.data(); + } + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + if (scores_size == 2) { + for (const auto& it : new_indices) { + int label = it.first; + std::stable_sort(new_indices[label].begin(), new_indices[label].end()); + } + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } +} + +template +void MultiClassOutput(const Tensor& scores, + const Tensor& bboxes, + const std::map>& selected_indices, + const int scores_size, + Tensor* outs, + int* oindices = nullptr, + const int offset = 0) { + int64_t class_num = scores.dims()[1]; + int64_t predict_dim = scores.dims()[1]; + int64_t box_size = bboxes.dims()[1]; + if (scores_size == 2) { + box_size = bboxes.dims()[2]; + } + int64_t out_dim = box_size + 2; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->mutable_data(); + const T* sdata; + Tensor bbox; + bbox.Resize({scores.dims()[0], box_size}); + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const std::vector& indices = it.second; + if (scores_size == 2) { + SliceOneClass(bboxes, label, &bbox); + } else { + sdata = scores_data + label * predict_dim; + } + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + odata[count * out_dim] = label; // label + const T* bdata; + if (scores_size == 3) { + bdata = bboxes_data + idx * box_size; + odata[count * out_dim + 1] = sdata[idx]; // score + if (oindices != nullptr) { + oindices[count] = offset + idx; + } + } else { + bdata = bbox.data() + idx * box_size; + odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + if (oindices != nullptr) { + oindices[count] = offset + idx * class_num + label; + } + } + // xmin, ymin, xmax, ymax or multi-points coordinates + std::memcpy(odata + count * out_dim + 2, bdata, box_size * 
sizeof(T)); + count++; + } + } +} + +class MulticlassNmsComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string type_ = "multiclass_nms"; + std::string bboxes_ = "bboxes"; + std::string scores_ = "scores"; + std::string out_ = "out"; + DDim bboxes_dims_{}; + DDim scores_dims_{}; + int keep_top_k_{2}; + float nms_threshold_{0.45f}; + float nms_eta_{1.f}; + int nms_top_k_{1}; + int background_label_{-1}; + float score_threshold_{0.01f}; + bool normalized_{false}; + + public: + MulticlassNmsComputeTester(const Place& place, + const std::string& alias, + DDim bboxes_dims, + DDim scores_dims, + int keep_top_k = 2, + float nms_threshold = 0.45f, + float nms_eta = 1.f, + int nms_top_k = 1, + int background_label = 1, + float score_threshold = 0.01f, + bool normalized = false) + : TestCase(place, alias), + bboxes_dims_(bboxes_dims), + scores_dims_(scores_dims), + keep_top_k_(keep_top_k), + nms_threshold_(nms_threshold), + nms_eta_(nms_eta), + nms_top_k_(nms_top_k), + background_label_(background_label), + score_threshold_(score_threshold), + normalized_(normalized) {} + + void RunBaseline(Scope* scope) override { + auto* boxes = scope->FindTensor(bboxes_); + auto* scores = scope->FindTensor(scores_); + auto* outs = scope->NewTensor(out_); + CHECK(outs); + outs->set_precision(PRECISION(kFloat)); + + auto score_size = scores_dims_.size(); + std::vector>> all_indices; + std::vector batch_starts = {0}; + int64_t batch_size = scores_dims_[0]; + int64_t box_dim = bboxes_dims_[2]; + int64_t out_dim = box_dim + 2; + int num_nmsed_out = 0; + Tensor boxes_slice, scores_slice; + int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + scores_slice.Resize({scores_dims_[1], scores_dims_[2]}); + boxes_slice = boxes->Slice(i, i + 1); + boxes_slice.Resize({scores_dims_[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } + std::map> indices; + MultiClassNMS(scores_slice, + boxes_slice, + score_size, + &indices, + &num_nmsed_out, + background_label_, + nms_top_k_, + keep_top_k_, + normalized_, + nms_threshold_, + nms_eta_, + score_threshold_); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({1, 1}); + float* od = outs->mutable_data(); + od[0] = -1; + batch_starts = {0, 1}; + } else { + outs->Resize({static_cast(num_kept), out_dim}); + outs->mutable_data(); + int offset = 0; + int* oindices = nullptr; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + boxes_slice = boxes->Slice(i, i + 1); + scores_slice.Resize({scores_dims_[1], scores_dims_[2]}); + boxes_slice.Resize({scores_dims_[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(scores_slice, + boxes_slice, + all_indices[i], + scores_dims_.size(), + &out, + oindices, + offset); + } + } + } + + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); + } + + void 
PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(type_); + op_desc->SetInput("BBoxes", {bboxes_}); + op_desc->SetInput("Scores", {scores_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("keep_top_k", keep_top_k_); + op_desc->SetAttr("nms_threshold", nms_threshold_); + op_desc->SetAttr("nms_eta", nms_eta_); + op_desc->SetAttr("nms_top_k", nms_top_k_); + op_desc->SetAttr("background_label", background_label_); + op_desc->SetAttr("score_threshold", score_threshold_); + op_desc->SetAttr("normalized", normalized_); + } + + void PrepareData() override { + std::vector bboxes(bboxes_dims_.production()); + for (int i = 0; i < bboxes_dims_.production(); ++i) { + bboxes[i] = i * 1. / bboxes_dims_.production(); + } + SetCommonTensor(bboxes_, bboxes_dims_, bboxes.data()); + + std::vector scores(scores_dims_.production()); + for (int i = 0; i < scores_dims_.production(); ++i) { + scores[i] = i * 1. / scores_dims_.production(); + } + SetCommonTensor(scores_, scores_dims_, scores.data()); + } +}; + +void TestMulticlassNms(Place place, float abs_error) { + int N = 3; + int M = 2500; + for (int class_num : {2, 4, 10}) { + std::vector bbox_shape{N, M, 4}; + std::vector score_shape{N, class_num, M}; + std::unique_ptr tester(new MulticlassNmsComputeTester( + place, "def", DDim(bbox_shape), DDim(score_shape))); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + +TEST(multiclass_nms, precision) { + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_ARM) + place = TARGET(kHost); +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; +#endif + + TestMulticlassNms(place, abs_error); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/yolo_box_compute_test.cc b/lite/tests/kernels/yolo_box_compute_test.cc index 2e98ce96cef479d55e77acebbe464d9a56f92934..c41c89608fd7496c5b01b1a813581f7f461ff0ee 100644 --- a/lite/tests/kernels/yolo_box_compute_test.cc +++ b/lite/tests/kernels/yolo_box_compute_test.cc @@ -228,14 +228,14 @@ class YoloBoxComputeTester : public arena::TestCase { } }; -void test_yolobox(Place place) { - for (int class_num : {1, 2, 3, 4}) { - for (float conf_thresh : {0.01, 0.2, 0.7}) { +void TestYoloBox(Place place, float abs_error) { + for (int class_num : {1, 4}) { + for (float conf_thresh : {0.01, 0.2}) { for (int downsample_ratio : {16, 32}) { - std::vector anchor({10, 13, 16, 30}); + std::vector anchor{10, 13, 16, 30, 33, 30}; std::unique_ptr tester(new YoloBoxComputeTester( place, "def", anchor, class_num, conf_thresh, downsample_ratio)); - arena::Arena arena(std::move(tester), place, 2e-5); + arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } } @@ -243,13 +243,17 @@ void test_yolobox(Place place) { } TEST(YoloBox, precision) { -// #ifdef LITE_WITH_X86 -// Place place(TARGET(kX86)); -// #endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_yolobox(place); + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_ARM) + place = TARGET(kARM); +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; #endif + + TestYoloBox(place, abs_error); } } // namespace lite diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 4c74f75c91c7931ffba060eb9180d858bc7d54a7..a888d8ef25cbe7c816693fa45d954672a8ad5b1f 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -14,6 +14,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4} # global variables BUILD_EXTRA=OFF +BUILD_TRAIN=OFF BUILD_JAVA=ON BUILD_PYTHON=OFF BUILD_DIR=$(pwd) @@ -226,6 
+227,7 @@ function make_full_publish_so { -DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DLITE_WITH_XPU=$BUILD_XPU \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_TRAIN=$BUILD_TRAIN \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} make publish_inference -j$NUM_PROC @@ -388,6 +390,7 @@ function print_usage { echo -e "optional argument:" echo -e "--shutdown_log: (OFF|ON); controls whether to shutdown log, default is ON" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" + echo -e "--build_train: (OFF|ON); controls whether to publish training operators and kernels, build_train is only for full_publish library now" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" echo -e "--build_java: (OFF|ON); controls whether to publish java api lib (Only ANDROID is supported)" echo -e "--build_dir: directory for building" @@ -436,6 +439,10 @@ function main { BUILD_EXTRA="${i#*=}" shift ;; + --build_train=*) + BUILD_TRAIN="${i#*=}" + shift + ;; --build_cv=*) BUILD_CV="${i#*=}" shift diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh index 2c3a8406f7e7c52ecb0268d581e043a3070ba028..5bc8544e9f81d07ed33ece4b33233b08b9227740 100755 --- a/lite/tools/build_bm.sh +++ b/lite/tools/build_bm.sh @@ -5,7 +5,7 @@ set -ex BM_SDK_ROOT="$(pwd)/third-party/bmlibs/bm_sc3_libs" # BM SDK TARGET_NAME="BM1682" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF -WITH_TESTING=ON # ON/OFF +WITH_TESTING=OFF # ON/OFF function print_usage { echo -e "\nUSAGE:" diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 703da69fa59f3aa99bad9fb04c0decb591486058..a5dc2b741d2d3d5fdd2f08d13b7dc483a3065b0e 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -192,6 +192,7 @@ function build_opencl { cmake_opencl ${os} ${abi} ${lang} make opencl_clhpp -j$NUM_CORES_FOR_COMPILE + make publish_inference -j$NUM_CORES_FOR_COMPILE build $TESTS_FILE }
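With add_operator()/add_kernel() now accepting the dedicated "train" level, the gradient and SGD operators compile only when LITE_WITH_TRAIN=ON, which build.sh exposes as --build_train (full_publish builds only, per the usage text above). An illustrative invocation, assuming typical Android arm64 settings; the flags other than --build_train are examples rather than the only valid combination:

./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --build_train=ON full_publish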