Unverified commit 34716531, authored by myq406450149, committed by GitHub

Merge pull request #3 from PaddlePaddle/develop

merge paddlelite
...@@ -72,6 +72,9 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
...@@ -181,7 +184,7 @@ include(external/xxhash) # download install xxhash needed for x86 jit
include(cudnn)
include(configure) # add paddle env configuration
if(LITE_WITH_CUDA)
include(cuda)
endif()
......
...@@ -117,8 +117,12 @@ endif()
if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
if (LITE_WITH_CV)
add_definitions("-DLITE_WITH_CV")
endif()
endif()
if (WITH_ARM_DOTPROD)
add_definitions("-DWITH_ARM_DOTPROD")
endif()
......
...@@ -26,7 +26,8 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
/usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
/usr/lib/${TARGET_ARCH}-linux-gnu/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
......
...@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
if(nv_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
...@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......
...@@ -43,6 +43,11 @@ function (lite_deps TARGET)
foreach(var ${lite_deps_ARM_DEPS})
set(deps ${deps} ${var})
endforeach(var)
if(LITE_WITH_CV)
foreach(var ${lite_cv_deps})
set(deps ${deps} ${var})
endforeach(var)
endif()
endif()
if(LITE_WITH_PROFILE)
...@@ -152,7 +157,9 @@ function(lite_cc_library TARGET)
endfunction()
function(lite_cc_binary TARGET)
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
...@@ -241,6 +248,7 @@ endfunction()
set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
...@@ -341,7 +349,7 @@ function(add_kernel TARGET device level)
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
nv_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS})
return()
endif()
# the source list will collect for paddle_use_kernel.h code generation.
......
...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
...@@ -116,6 +117,9 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
endif()
if(LITE_WITH_CUDA)
add_dependencies(publish_inference paddle_full_api_shared)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
...@@ -129,6 +133,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
#COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
if(NOT IOS)
#add_dependencies(publish_inference_cxx_lib model_optimize_tool)
...@@ -136,15 +141,17 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference paddle_light_api_shared)
add_custom_command(TARGET publish_inference_cxx_lib
COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib)
endif()
add_dependencies(publish_inference publish_inference_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
endif()
endif()
else()
if (IOS)
...@@ -153,6 +160,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_lib)
...@@ -164,6 +172,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference tiny_publish_cxx_lib)
...@@ -183,11 +192,13 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
)
add_dependencies(publish_inference_java_lib paddle_lite_jni PaddlePredictor)
add_dependencies(publish_inference publish_inference_java_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_java_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/java/so/libpaddle_lite_jni.so)
endif()
endif()
if ((ARM_TARGET_OS STREQUAL "android") AND
((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
if (NOT LITE_ON_TINY_PUBLISH)
# copy
...@@ -202,6 +213,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
...@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
...@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
...@@ -59,6 +61,7 @@ endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
...@@ -76,8 +79,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -76,8 +79,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
CL_DEPS ${opencl_kenrels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kenrels}) FPGA_DEPS ${fpga_kernels})
endif() endif()
# for light api # for light api
...@@ -96,8 +99,8 @@ lite_cc_library(light_api SRCS light_api.cc ...@@ -96,8 +99,8 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kenrels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kenrels}) FPGA_DEPS ${fpga_kernels})
include(ExternalProject) include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
...@@ -289,7 +292,8 @@ if(NOT IOS) ...@@ -289,7 +292,8 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}) X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
...@@ -297,7 +301,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
...@@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
if (!program_) {
GenRuntimeProgram();
}
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
}
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (size_t i = 0; i < feeds.size(); i++) {
...@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
void Predictor::Build(const lite_api::CxxConfig &config,
...@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
Program program(desc, scope_, inner_places);
/// The first place in valid_places is
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
factor.ConsiderDataLayout();
optimizer_.Run(std::move(program), inner_places, factor, passes);
exec_scope_ = optimizer_.exec_scope();
PrepareFeedFetch();
...@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
return &var->Get<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_bool(is_run_model_optimize,
false,
"apply model_optimize_tool to model, use optimized model to test");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::string& model_dir,
const int repeat,
const int warmup_times,
const int thread_num) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
if (thread_num == 1) {
config.set_power_mode(LITE_POWER_HIGH);
} else {
config.set_power_mode(LITE_POWER_NO_BIND);
}
auto predictor = lite_api::CreatePaddlePredictor(config);
// set input
auto input_image = predictor->GetInput(0);
input_image->Resize({1, 3, 300, 300});
auto input_image_data = input_image->mutable_data<float>();
std::ifstream read_file("/data/local/tmp/pjc/ssd_img.txt");
if (!read_file.is_open()) {
LOG(INFO) << "read image file fail";
return;
}
auto input_shape = input_image->shape();
int64_t input_image_size = 1;
for (auto t : input_shape) {
input_image_size *= t;
}
for (int i = 0; i < input_image_size; i++) {
read_file >> input_image_data[i];
}
// warmup and run
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
auto start = lite::GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
predictor->Run();
}
// show result
auto end = lite::GetCurrentUS();
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (end - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto out = predictor->GetOutput(0);
auto out_data = out->data<float>();
LOG(INFO) << "output shape:";
auto out_shape = out->shape();
for (auto t : out_shape) {
LOG(INFO) << t;
}
LOG(INFO) << "output data:";
int output_len = 20;
for (int i = 0; i < output_len; i++) {
LOG(INFO) << out_data[i];
}
}
#endif
} // namespace lite_api
} // namespace paddle
TEST(Faster_RCNN, test_arm) {
std::string save_optimized_model_dir;
if (FLAGS_is_run_model_optimize) {
save_optimized_model_dir = FLAGS_model_dir + "opt";
paddle::lite_api::OutputOptModel(FLAGS_model_dir, save_optimized_model_dir);
}
std::string run_model_dir =
FLAGS_is_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(
run_model_dir, FLAGS_repeats, FLAGS_warmup, FLAGS_threads);
}
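For context: this test is configured entirely through gflags. FLAGS_model_dir, FLAGS_warmup, FLAGS_repeats and FLAGS_threads are declared in the included test_helper.h, while is_run_model_optimize is defined locally above, so a typical on-device invocation would pass flags along the lines of --model_dir=/data/local/tmp/<model> --warmup=10 --repeats=100 --threads=1 --is_run_model_optimize=true to the resulting gtest binary (the binary name depends on the CMake test target, which is not shown in this hunk).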
...@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
TestModel(valid_places);
......
...@@ -80,7 +80,16 @@ void Main() {
if (target_repr == "arm") {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86));
} else {
......
...@@ -202,7 +202,8 @@ void conv1x1s1_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
} else {
sgemm_prepack(false,
m,
...@@ -395,7 +396,8 @@ void conv_im2col_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
} else {
int ldb = n;
sgemm_prepack(false,
......
...@@ -22,6 +22,28 @@ namespace lite {
namespace arm {
namespace math {
inline std::vector<int> get_new_shape(
std::vector<const lite::Tensor*> list_new_shape_tensor) {
// read one int32 value from each shape tensor to build the new shape
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
}
return vec_new_shape;
}
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
lite::Tensor cpu_starts_tensor;
vec_new_data =
std::vector<T>(new_data, new_data + new_data_tensor->dims().production());
return vec_new_data;
}
// The following function bilinear_interp is partially base on
// https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp
// Tencent is pleased to support the open source community by making ncnn
...@@ -472,33 +494,52 @@ void nearest_interp(const float* src,
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
lite::Tensor* Scale,
lite::Tensor* Out,
int out_height,
int out_width,
float scale,
bool with_align,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
if (SizeTensor.size() > 0) {
auto new_size = get_new_shape(SizeTensor);
out_height = new_size[0];
out_width = new_size[1];
} else {
auto scale_tensor = Scale;
if (scale_tensor != nullptr) {
auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
scale = scale_data[0];
}
if (scale > 0) {
out_height = static_cast<int>(in_h * scale);
out_width = static_cast<int>(in_w * scale);
}
auto out_size = OutSize;
if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<float>(out_size);
out_height = static_cast<int>(out_size_data[0]);
out_width = static_cast<int>(out_size_data[1]);
}
}
float height_scale = scale;
float width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
}
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
float* dout = Out->mutable_data<float>();
const float* din = X->data<float>();
int out_num = Out->dims()[0];
int out_c = Out->dims()[1];
int count = out_num * out_c;
int out_h = Out->dims()[2];
int out_w = Out->dims()[3];
int spatial_in = in_h * in_w;
......
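For readers skimming the new interpolate entry point above: the output size is resolved with the precedence SizeTensor first, then the Scale tensor (or the scale attribute), with OutSize overriding the scale-derived size. A minimal stand-alone sketch of that resolution logic for one dimension (function and parameter names here are illustrative, not part of the patch):

#include <vector>

// Sketch only: mirrors the order used in interpolate() above.
int ResolveOutHeight(int in_h, int out_height, float scale,
                     const std::vector<int>& size_tensor_hw,  // from SizeTensor
                     const int* out_size_hw) {                // from OutSize
  if (!size_tensor_hw.empty()) return size_tensor_hw[0];      // highest priority
  if (scale > 0) out_height = static_cast<int>(in_h * scale); // Scale tensor/attr
  if (out_size_hw != nullptr) out_height = out_size_hw[0];    // OutSize overrides
  return out_height;
}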
...@@ -44,11 +44,12 @@ void nearest_interp(const float* src,
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
lite::Tensor* Scale,
lite::Tensor* Out,
int out_height,
int out_width,
float scale,
bool with_align,
std::string interpolate_type);
......
This diff has been collapsed.
...@@ -116,6 +116,27 @@ void pooling3x3s2p1_max(const float* din,
int hin,
int win);
void pooling3x3s1p0_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win);
void pooling3x3s1p0_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive);
void pooling3x3s2p1_avg(const float* din,
float* dout,
int num,
......
This diff has been collapsed.
...@@ -15,6 +15,8 @@
#pragma once
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
...@@ -28,9 +30,10 @@ bool sgemv(const float* A,
bool transA,
int M,
int N,
bool is_bias,
const float* bias,
bool is_relu,
const ARMContext* ctx);
} // namespace math
} // namespace arm
......
...@@ -56,6 +56,15 @@
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \
}
const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int CUDA_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
inline int CUDA_GET_BLOCKS(const int N, const int base) {
return (N + base - 1) / base;
}
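As a point of reference, a host-side sketch of how CUDA_NUM_THREADS and CUDA_GET_BLOCKS are typically used to size a 1-D launch; the kernel and wrapper below are illustrative assumptions, not code from this patch:

// Hypothetical element-wise kernel, used only to show the launch arithmetic.
__global__ void scale_kernel(float* data, float alpha, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) data[idx] *= alpha;
}

void LaunchScale(float* d_data, float alpha, int n, cudaStream_t stream) {
  // CUDA_GET_BLOCKS(n) == ceil(n / CUDA_NUM_THREADS), i.e. one thread per element.
  scale_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(
      d_data, alpha, n);
}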
namespace paddle {
namespace lite {
namespace cuda {
......
...@@ -12,6 +12,8 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale
cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
set (
math_cuda
...@@ -21,6 +23,8 @@ set (
cuda_type_trans
cuda_transpose
cuda_elementwise
cuda_gemm
cuda_batched_gemm
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/batched_gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <>
bool BatchedGemm<float, float>::init(const bool trans_a,
const bool trans_b,
const int max_batch_size,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
cudaMalloc(reinterpret_cast<void **>(&A_),
3 * max_batch_size * sizeof(float *));
return true;
}
template <>
bool BatchedGemm<float, float>::run(const float alpha,
const float beta,
const float *a[],
const float *b[],
float *c[],
const int m,
const int n,
const int k,
const int batch_size) {
CHECK(a != nullptr);
CHECK(b != nullptr);
CHECK(c != nullptr);
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cudaMemcpyAsync(A_,
a,
batch_size * sizeof(const float *),
cudaMemcpyHostToDevice,
exe_stream_);
cudaMemcpyAsync(A_ + batch_size,
b,
batch_size * sizeof(const float *),
cudaMemcpyHostToDevice,
exe_stream_);
cudaMemcpyAsync(A_ + batch_size * 2,
c,
batch_size * sizeof(float *),
cudaMemcpyHostToDevice,
exe_stream_);
CUBLAS_CALL(cublasSgemmBatched(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
const_cast<const float **>(A_ + batch_size),
ldb_,
const_cast<const float **>(A_),
lda_,
&beta,
A_ + batch_size * 2,
ldc_,
batch_size));
return true;
}
template <>
bool BatchedGemm<float, float>::run(const float alpha,
const float beta,
const float *a[],
const int m,
const int n,
const int k,
const int batch_size) {
CHECK(a != nullptr);
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cudaMemcpyAsync(A_,
a,
3 * batch_size * sizeof(const float *),
cudaMemcpyDefault,
exe_stream_);
CUBLAS_CALL(cublasSgemmBatched(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
const_cast<const float **>(A_ + batch_size),
ldb_,
const_cast<const float **>(A_),
lda_,
&beta,
A_ + batch_size * 2,
ldc_,
batch_size));
return true;
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class BatchedGemm {
public:
BatchedGemm() : cu_handle_(nullptr) {}
~BatchedGemm() {
if (A_ != nullptr) {
cudaFree(A_);
}
}
bool init(const bool trans_a,
const bool trans_b,
const int max_batch_size,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a[],
const PtypeIn* b[],
PtypeOut* c[],
const int m,
const int n,
const int k,
const int batch_size);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a[],
const int m,
const int n,
const int k,
const int batch_size);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
PtypeIn** A_{nullptr};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
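A rough usage sketch for the BatchedGemm wrapper declared above (the buffer setup and the ctx object are assumed to exist elsewhere; note that run() expects host-side arrays of device pointers, which it copies to the device with cudaMemcpyAsync):

// Sketch only: c[i] = a[i] * b[i] for batch_size row-major [m x k] * [k x n] matrices.
void BatchedMatmulSketch(const float* d_a[], const float* d_b[], float* d_c[],
                         int m, int n, int k, int batch_size,
                         paddle::lite::Context<TARGET(kCUDA)>* ctx) {
  paddle::lite::cuda::math::BatchedGemm<float, float> bgemm;
  bgemm.init(false, false, batch_size, ctx);  // no transpose, reserve the batch
  bgemm.run(1.f, 0.f, d_a, d_b, d_c, m, n, k, batch_size);
  // Work is queued on ctx->exec_stream(); synchronize before reading d_c.
}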
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <>
bool Gemm<float, float>::init(const bool trans_a,
bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
lda_ = (!trans_a) ? k : m;
ldb_ = (!trans_b) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool Gemm<float, float>::init(const bool trans_a,
bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
m_ = m;
n_ = n;
k_ = k;
lda_ = lda;
ldb_ = ldb;
ldc_ = ldc;
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool Gemm<float, float>::run(const float alpha,
const float beta,
const float *a,
const float *b,
float *c,
Context<TARGET(kCUDA)> *ctx) {
CUBLAS_CALL(cublasSgemm(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b,
ldb_,
a,
lda_,
&beta,
c,
ldc_));
return true;
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class Gemm {
public:
Gemm() : cu_handle_(nullptr) {}
~Gemm() {}
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)>* ctx);
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a,
const PtypeIn* b,
PtypeOut* c,
Context<TARGET(kCUDA)>* ctx);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
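And likewise a minimal sketch for the single-matrix Gemm wrapper (same assumptions as the batched sketch above):

// Sketch only: d_c = d_a * d_b with row-major a: [m x k], b: [k x n], c: [m x n].
void MatmulSketch(const float* d_a, const float* d_b, float* d_c,
                  int m, int n, int k,
                  paddle::lite::Context<TARGET(kCUDA)>* ctx) {
  paddle::lite::cuda::math::Gemm<float, float> gemm;
  gemm.init(false, false, m, n, k, ctx);   // derives lda/ldb/ldc from the shapes
  gemm.run(1.f, 0.f, d_a, d_b, d_c, ctx);  // issues cublasSgemm on ctx->exec_stream()
}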
...@@ -67,7 +67,7 @@ std::string UniqueName(const std::string& prefix) {
return prefix + "_" + std::to_string(counter);
}
ge::DataType CvtPrecisionType(PrecisionType itype) {
ge::DataType otype = ge::DT_FLOAT;
switch (itype) {
case PRECISION(kFloat):
...@@ -80,14 +80,14 @@ ge::DataType CvtPrecisionType(PrecisionType itype) {
otype = ge::DT_INT32;
break;
default:
LOG(FATAL) << "[NPU] Can not convert precision type("
<< PrecisionToStr(itype) << ") from Lite to NPU";
break;
}
return otype;
}
ge::Format CvtDataLayoutType(DataLayoutType itype) {
ge::Format otype = ge::FORMAT_NCHW;
switch (itype) {
case DATALAYOUT(kNCHW):
...@@ -95,17 +95,17 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) {
break;
// TODO(hong19860320) support more data layout type
default:
LOG(FATAL) << "[NPU] Can not convert data layout type("
<< DataLayoutToStr(itype) << ") from Lite to NPU";
break;
}
return otype;
}
ge::TensorPtr CvtTensor(lite::Tensor* in_tensor,
std::vector<int64_t> out_shape,
PrecisionType in_ptype,
DataLayoutType in_ltype) {
uint8_t* in_data = nullptr;
auto in_size = in_tensor->dims().production();
auto in_shape = in_tensor->dims().Vectorize();
...@@ -123,10 +123,10 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
in_bytes = in_size * sizeof(int8_t);
} else {
LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype);
}
ge::DataType out_ptype = CvtPrecisionType(in_ptype);
ge::Format out_ltype = CvtDataLayoutType(in_ltype);
ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype);
CHECK_EQ(out_ltype, ge::FORMAT_NCHW);
...@@ -140,6 +140,31 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
return out_tensor;
}
int CvtActMode(std::string act_type) {
int act_mode = 1;
if (act_type == "sigmod") {
act_mode = 0;
} else if (act_type == "relu") {
act_mode = 1;
} else if (act_type == "tanh") {
act_mode = 2;
} else if (act_type == "elu") {
act_mode = 4;
} else if (act_type == "abs") {
act_mode = 6;
} else if (act_type == "softsign") {
act_mode = 8;
} else if (act_type == "softplus") {
act_mode = 9;
} else if (act_type == "hardsigmoid") {
act_mode = 10;
} else {
// TODO(hong19860320) support more activation mode
LOG(FATAL) << "[NPU] Unsupported activation type " << act_type;
}
return act_mode;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
......
...@@ -31,117 +31,6 @@
// Extended Ops of HIAI DDK
namespace ge {
/**
* Multiply the matrix x1 by the matrix x2 to generate x1 * x2.
* The inputs must be two-dimensional matrices and the inner dimension of "x1"
* (after being transposed if transpose_x1 is true) must match the outer
* dimension of "x2" (after being transposed if transposed_x2 is true). <Input>
* x : the first input tensor, must be non const op.
* w : the second input tensor, must be const op.
* bias: the optional bias tensor, must be const op.
* <Output>
* y : the output tensor.
* <Attr>
* has_bias: If true, enable input bias.
*/
REG_OP(MatMul)
.INPUT(x, TensorType({DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true
.OP_END();
/**
* Computes the gradients of convolution with respect to the input.
* <Input>
* input_sizes : An integer vector representing the shape of input,
* where input is a 4-D [batch, height, width, channels] tensor.
* filter : the filter tensor, with shape [H , W, filter_channel,
* filter_number], filter_channel must be same as x channel.
* x : The input tensor.
* <Output>
* y : The output tensor.
* <Attr>
* format: 0: NCHW. 1: NHWC
* group : 1: default
* num_output : 0: default, num_output must be equal to
* (filter_channel * group)
* pad : Padding for the beginning and ending along each axis
* stride : Stride along each axis.
* dilation : dilation value along each axis of the filter.
* pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET
* bias_term : 0: default
* kernel : The shape of the convolution kernel
*/
REG_OP(Deconvolution)
.INPUT(input_sizes, TensorType({DT_UINT8}))
.INPUT(filter, TensorType({DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(mode, AttrValue::INT{1})
.ATTR(format, AttrValue::INT{1})
.ATTR(group, AttrValue::INT{1})
.ATTR(num_output, AttrValue::INT{0})
.ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0}))
.ATTR(stride, AttrValue::LIST_INT({1, 1}))
.ATTR(dilation, AttrValue::LIST_INT({1, 1}))
.ATTR(pad_mode, AttrValue::INT{0})
.ATTR(bias_term, AttrValue::INT{0})
.ATTR(kernel, AttrValue::LIST_INT({0, 0}))
.OP_END();
/**
* Resize images to size using bilinear interpolation.
* <Input>
* x : The tensor of 4-D
* w : A int32 Tensor of 2 elements: [height, width].
* <Output>
* y : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels.
* output_dim_mode : Defaults 2, including 0: zoom_factor , 1:
* shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is
* controled by the [height, width] of w.
* shrink_factor : shrink factor.
* zoom_factor : zoom factor.
* pad_begin : begin of pad.
* pad_end : end of pad.
*/
REG_OP(ResizeBilinear)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32}))
.INPUT(w, TensorType({DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
.ATTR(align_corners, AttrValue::BOOL{false})
.ATTR(output_dim_mode, AttrValue::INT{2})
.ATTR(shrink_factor, AttrValue::INT{1})
.ATTR(zoom_factor, AttrValue::INT{1})
.ATTR(pad_begin, AttrValue::INT{0})
.ATTR(pad_end, AttrValue::INT{0})
.OP_END();
/**
* Resize images to size using nearest neighbor interpolation.
* <Input>
* image : Resize images to size using nearest neighbor interpolation.
* size : Must be one dimension and two elements
* <Output>
* output : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels. Defaults to false
*/
REG_OP(ResizeNearestNeighbor)
.INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.INPUT(size, TensorType({DT_INT32}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.ATTR(align_corners, AttrValue::BOOL{false})
.OP_END();
/**
* Pads a tensor.
* <Input>
...@@ -192,14 +81,14 @@ bool BuildModel(std::vector<ge::Operator>& inputs, // NOLINT
std::string UniqueName(const std::string& prefix);
ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
ge::TensorPtr CvtTensor(Tensor* in_tensor,
std::vector<int64_t> out_shape = {},
PrecisionType in_ptype = PRECISION(kFloat),
DataLayoutType in_ltype = DATALAYOUT(kNCHW));
template <typename T>
ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
...@@ -214,7 +103,7 @@ ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
} else if (info == typeid(int32_t)) {
type = ge::DT_INT32;
} else {
LOG(FATAL) << "[NPU] Unknow value type " << info.name();
}
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
...@@ -245,6 +134,8 @@ ge::TensorPtr CreateTensorAndFillData(T value,
return CreateTensorAndFillData(data, shape, format);
}
int CvtActMode(std::string act_type);
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
......
...@@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() {
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
break; \
} \
......
...@@ -39,7 +39,12 @@ void MatMulJitCode::genCode() { ...@@ -39,7 +39,12 @@ void MatMulJitCode::genCode() {
size_t wgt_offset = 0; size_t wgt_offset = 0;
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len;
}
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
wgt_offset = wgt_offset_tmp;
vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
// clean // clean
if (k == 0) { if (k == 0) {
...@@ -48,7 +53,8 @@ void MatMulJitCode::genCode() { ...@@ -48,7 +53,8 @@ void MatMulJitCode::genCode() {
} }
} }
for (int i = 0; i < groups[g]; ++i) { for (int i = 0; i < groups[g]; ++i) {
vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); vmovups(zmm_t(w_reg_idx),
ptr[reg_ptr_wgt + wgt_offset + k * n_ * sizeof(float)]);
vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
wgt_offset += block_len; wgt_offset += block_len;
} }
......
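A note on the offset arithmetic introduced in the hunk above (derived only from the code shown):

// For group g, row k and block i inside the group, the weight load address is now
//   reg_ptr_wgt + (groups[0] + ... + groups[g-1] + i) * block_len
//               + k * n_ * sizeof(float)
// i.e. the per-group column base is re-established for every k instead of
// letting wgt_offset keep accumulating across rows.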
...@@ -50,7 +50,8 @@ math_library(unpooling) ...@@ -50,7 +50,8 @@ math_library(unpooling)
math_library(vol2col) math_library(vol2col)
## math_library(prelu) ## math_library(prelu)
math_library(tree2col DEPS math_function) math_library(tree2col DEPS math_function)
math_library(sequence_topk_avg_pooling)
math_library(search_fc DEPS blas dynload_mklml)
# cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
# cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
# cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/search_fc.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
/*
 * SearchFcFunctor computes top = bottom * w^T (+ b), where bottom is
 * [batch, _in] and w is [_out, _in], as used by the search_fc op.
 */
template <typename T>
class SearchFcFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::X86Context& context,
const lite::Tensor& bottom,
const lite::Tensor& w,
const lite::Tensor& b,
lite::Tensor* top,
int out_size) {
int batch = bottom.dims()[0];
    int _out = w.dims()[0];  // output feature size
    int _in = w.dims()[1];   // input feature size
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas,
CblasNoTrans,
CblasTrans,
batch,
_out,
_in,
1.0f,
bottom_data,
weights,
0.0f,
top_data);
if (true) {
const auto* bias_data = b.data<T>();
for (int i = 0; i < batch; ++i) {
// add bias here
sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out);
}
}
}
// private:
};
#define DEFINE_FUNCTOR(type) \
template class SearchFcFunctor<lite::TargetType::kX86, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
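For reference, the FOR_ALL_TYPES(DEFINE_FUNCTOR) line above expands to a single explicit instantiation, since only float is listed in FOR_ALL_TYPES:

template class SearchFcFunctor<lite::TargetType::kX86, float>;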
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/mklml.h"
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename DeviceContext, typename T>
void call_gemm(const BlasT<lite::TargetType::kX86, T> blas,
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const T* B,
const T beta,
T* C) {
#ifndef __NAIVE_GEMM__
int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K;
blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
#else
naive::gemm((TransA == CblasTrans),
(TransB == CblasTrans),
M,
N,
K,
alpha,
A,
B,
beta,
C);
#endif // !__NAIVE_GEMM__
}
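A worked instantiation of the leading-dimension logic above, matching the search_fc call in the .cc file (A = bottom [batch x _in], B = weights [_out x _in], TransB = CblasTrans):

//   M = batch, N = _out, K = _in
//   lda = K (= _in)   because TransA == CblasNoTrans
//   ldb = K (= _in)   because TransB == CblasTrans
//   ldc = N (= _out)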
// To align with Lego
#ifndef LEGO_USE_FLOAT
#define LEGO_USE_FLOAT
#endif
#ifndef LEGO_SSE
#define LEGO_SSE
#endif
#if defined(LEGO_USE_FLOAT)
#define __m256x __m256
#define __m128x __m128
static const unsigned int AVX_STEP_SIZE = 8;
static const unsigned int SSE_STEP_SIZE = 4;
static const unsigned int AVX_CUT_LEN_MASK = 7U;
static const unsigned int SSE_CUT_LEN_MASK = 3U;
#define _mm256_setzero_px _mm256_setzero_ps
#define _mm256_mul_px _mm256_mul_ps
#define _mm256_add_px _mm256_add_ps
#define _mm256_load_px _mm256_loadu_ps
#define _mm256_hadd_px _mm256_hadd_ps
#define _mm256_permute2f128_px _mm256_permute2f128_ps
#define _mm256_store_px _mm256_storeu_ps
#define _mm256_broadcast_sx _mm256_broadcast_ss
#define _mm256_castpx256_px128 _mm256_castps256_ps128
#define _mm256_max_px _mm256_max_ps
#define _mm256_sub_px _mm256_sub_ps
#define _mm256_set1_px _mm256_set1_ps
#define _mm256_sqrt_px _mm256_sqrt_ps
#define _mm256_div_px _mm256_div_ps
#define _mm_setzero_px _mm_setzero_ps
#define _mm_add_px _mm_add_ps
#define _mm_mul_px _mm_mul_ps
#define _mm_load_px _mm_loadu_ps
#define _mm_hadd_px _mm_hadd_ps
#define _mm_store_sx _mm_store_ss
#define _mm_store_px _mm_storeu_ps
#define _mm_load1_px _mm_load1_ps
#define _mm_max_px _mm_max_ps
#define _mm_sub_px _mm_sub_ps
#define _mm_set1_px _mm_set1_ps
#define _mm_sqrt_px _mm_sqrt_ps
#define _mm_div_px _mm_div_ps
#elif defined(LEGO_USE_DOUBLE)
#define __m256x __m256d
#define __m128x __m128d
static const unsigned int AVX_STEP_SIZE = 4;
static const unsigned int SSE_STEP_SIZE = 2;
static const unsigned int AVX_CUT_LEN_MASK = 3U;
static const unsigned int SSE_CUT_LEN_MASK = 1U;
#define _mm256_setzero_px _mm256_setzero_pd
#define _mm256_mul_px _mm256_mul_pd
#define _mm256_add_px _mm256_add_pd
#define _mm256_load_px _mm256_loadu_pd
#define _mm256_hadd_px _mm256_hadd_pd
#define _mm256_permute2f128_px _mm256_permute2f128_pd
#define _mm256_store_px _mm256_storeu_pd
#define _mm256_broadcast_sx _mm256_broadcast_sd
#define _mm256_castpx256_px128 _mm256_castpd256_pd128
#define _mm256_max_px _mm256_max_pd
#define _mm256_sub_px _mm256_sub_pd
#define _mm256_set1_px _mm256_set1_pd
#define _mm256_sqrt_px _mm256_sqrt_pd
#define _mm256_div_px _mm256_div_pd
#define _mm_setzero_px _mm_setzero_pd
#define _mm_add_px _mm_add_pd
#define _mm_mul_px _mm_mul_pd
#define _mm_load_px _mm_loadu_pd
#define _mm_hadd_px _mm_hadd_pd
#define _mm_store_sx _mm_store_sd
#define _mm_store_px _mm_storeu_pd
#define _mm_load1_px _mm_load1_pd
#define _mm_max_px _mm_max_pd
#define _mm_sub_px _mm_sub_pd
#define _mm_set1_px _mm_set1_pd
#define _mm_sqrt_px _mm_sqrt_pd
#define _mm_div_px _mm_div_pd
#endif
template <typename T>
inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) {
unsigned int jjj, lll;
jjj = lll = 0;
#if defined(LEGO_AVX)
lll = len & ~AVX_CUT_LEN_MASK;
for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
_mm256_store_px(
z + jjj,
_mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj)));
}
#elif defined(LEGO_SSE)
lll = len & ~SSE_CUT_LEN_MASK;
for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
_mm_store_px(z + jjj,
_mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj)));
}
#endif
for (; jjj < len; jjj++) {
z[jjj] = x[jjj] + y[jjj];
}
}
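A small self-contained check of sse_eltadd's vector-plus-tail handling; this is a sketch that assumes the header is available as lite/backends/x86/math/search_fc.h inside the Lite build:

#include <cstdio>
#include "lite/backends/x86/math/search_fc.h"
int main() {
  float x[5] = {1, 2, 3, 4, 5};
  float y[5] = {10, 20, 30, 40, 50};
  float z[5] = {0};
  // len = 5: four elements take the SSE path, the fifth the scalar tail.
  paddle::lite::x86::math::sse_eltadd(x, y, z, 5);
  for (int i = 0; i < 5; ++i) printf("%.0f ", z[i]);  // 11 22 33 44 55
  return 0;
}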
template <lite::TargetType Target, typename T>
class SearchFcFunctor {
public:
void operator()(const lite::Context<Target>& context,
const lite::Tensor& X,
const lite::Tensor& W,
const lite::Tensor& b,
lite::Tensor* Out,
int out_size);
};
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
#define FOR_ALL_TYPES(macro) macro(float);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/sequence_topk_avg_pooling.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename T>
void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) {
size_t real_k = k < length ? k : length;
std::vector<T> v(data, data + length);
std::vector<int> topk_pos;
T min_val = -10000000.0;
while (topk_pos.size() < real_k) {
T max_val = min_val;
int max_pos = -1;
for (int i = 0; i < length; ++i) {
if (v[i] > max_val) {
max_pos = i;
max_val = v[i];
}
}
assert(max_pos >= 0);
topk_pos.push_back(max_pos);
v[max_pos] = min_val;
}
assert(topk_pos.size() > 0);
while (topk_pos.size() < (size_t)k) {
topk_pos.push_back(-1);
}
for (size_t i = 0; i < topk_pos.size(); ++i) {
pos[i] = topk_pos[i];
}
}
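A worked example of the selection and padding behavior above (illustrative only):

// data = {0.2f, 0.9f, 0.5f}, length = 3, k = 4
//   real_k = min(k, length) = 3
//   get_topk_pos -> pos = {1, 2, 0, -1}
// Indices are emitted in descending order of value and padded with -1 once the
// real_k largest positions have been taken.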
/*
 * SequenceTopkAvgPoolingFunctor computes, for every channel of every row,
 * the averages of the top-k values (one output per entry in `topks`) and
 * records the selected positions in `pos`.
 */
template <typename T>
class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::Tensor& in,
const lite::Tensor& row,
const lite::Tensor& col,
lite::Tensor* out,
lite::Tensor* pos,
int channel_num,
std::vector<int> topks) {
auto k_num = topks.size();
auto max_k = topks[topks.size() - 1];
std::vector<int64_t> vec_pos_shape;
auto in_lod = in.lod()[0];
auto row_lod = row.lod()[0];
auto col_lod = col.lod()[0];
int batch_size = row_lod.size() - 1;
int pos_total_size = row_lod[batch_size] * channel_num * max_k;
vec_pos_shape.push_back(pos_total_size);
lite::DDim dims(vec_pos_shape);
pos->Resize(dims);
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0;
std::vector<size_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i];
vec_out_lod.push_back(offset);
}
lite::LoD lod_temp;
lod_temp.push_back(vec_out_lod);
out->set_lod(lod_temp);
auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) {
int total_size = in_lod[i + 1] - in_lod[i];
int row_size = row_lod[i + 1] - row_lod[i];
int col_size = col_lod[i + 1] - col_lod[i];
CHECK_EQ(total_size, channel_num * row_size * col_size)
<< "size wrong in sequence_topk_avg_pooling_op!";
int feature_num = row_size * col_size;
for (int j = 0; j < channel_num; ++j) {
auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num;
for (int r = 0; r < row_size; ++r) {
auto row_data = input_offset_feature_data + r * col_size;
auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
r * channel_num * max_k + j * max_k;
auto out_slice_data = out_data + row_lod[i] * channel_num * k_num +
r * channel_num * k_num + j * k_num;
get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
if (pos_slice_data[0] == -1) {
sum_data[0] = 0.0;
} else {
sum_data[0] = row_data[pos_slice_data[0]];
}
for (int k = 1; k < max_k; ++k) {
if (pos_slice_data[k] == -1) {
sum_data[k] = sum_data[k - 1];
} else {
sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]];
}
}
for (size_t k = 0; k < k_num; ++k) {
out_slice_data[k] = sum_data[topks[k] - 1] / topks[k];
}
}
}
}
delete[] sum_data;
}
};
#define DEFINE_FUNCTOR(type) \
template class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
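A worked example of the per-row arithmetic in the functor above (illustrative only):

// One channel, one row: row_data = {3, 1, 2}, topks = {1, 3}, so max_k = 3.
//   get_topk_pos -> pos      = {0, 2, 1}
//   prefix sums  -> sum_data = {3, 3 + 2, 3 + 2 + 1} = {3, 5, 6}
//   outputs      -> out      = {sum_data[0] / 1, sum_data[2] / 3} = {3.0, 2.0}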
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename T>
void get_topk_pos(
const T* data, int length, int k, int* pos, bool debug = false);
template <lite::TargetType Target, typename T>
class SequenceTopkAvgPoolingFunctor {
public:
void operator()(const lite::Tensor& X,
const lite::Tensor& ROW,
const lite::Tensor& COLUMN,
lite::Tensor* Out,
lite::Tensor* pos,
int channel_num,
std::vector<int> topks);
};
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
#define FOR_ALL_TYPES(macro) macro(float);
...@@ -207,6 +207,13 @@ class Context<TargetType::kCUDA> { ...@@ -207,6 +207,13 @@ class Context<TargetType::kCUDA> {
ctx->cublas_fp32_ = cublas_fp32_; ctx->cublas_fp32_ = cublas_fp32_;
} }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
this->cublas_fp32_ = context.cublas_fp32_;
return *this;
}
const cudaStream_t& exec_stream() const { return exec_stream_; } const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
...@@ -232,6 +239,13 @@ class Context<TargetType::kCUDA> { ...@@ -232,6 +239,13 @@ class Context<TargetType::kCUDA> {
std::string name() const { return "CUDAContext"; } std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private: private:
int device_id_; int device_id_;
// overall information // overall information
......
...@@ -59,6 +59,12 @@ namespace paddle { ...@@ -59,6 +59,12 @@ namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef TARGET_IOS #ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
...@@ -1033,7 +1039,7 @@ int DeviceInfo::Setup() { ...@@ -1033,7 +1039,7 @@ int DeviceInfo::Setup() {
<< ", max freq: " << max_freqs_[i] << ", max freq: " << max_freqs_[i]
<< ", min freq: " << min_freqs_[i] << ", min freq: " << min_freqs_[i]
<< ", cluster ID: " << cluster_ids_[core_ids_[i]] << ", cluster ID: " << cluster_ids_[core_ids_[i]]
<< ", CPU ARCH: A" << archs_[i]; << ", CPU ARCH: A" << static_cast<int>(archs_[i]);
} }
LOG(INFO) << "L1 DataCache size is: "; LOG(INFO) << "L1 DataCache size is: ";
for (int i = 0; i < core_num_; ++i) { for (int i = 0; i < core_num_; ++i) {
...@@ -1087,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { ...@@ -1087,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
RequestPowerRandLowMode(shift_num, thread_num); RequestPowerRandLowMode(shift_num, thread_num);
break; break;
default: default:
LOG(FATAL) << "Unsupported power mode: " << mode; LOG(FATAL) << "Unsupported power mode: " << static_cast<int>(mode);
break; break;
} }
if (active_ids_.empty()) { if (active_ids_.empty()) {
......
...@@ -79,7 +79,6 @@ class DeviceInfo { ...@@ -79,7 +79,6 @@ class DeviceInfo {
int core_num_; int core_num_;
std::vector<int> max_freqs_; std::vector<int> max_freqs_;
std::vector<int> min_freqs_; std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_; std::string dev_name_;
std::vector<int> L1_cache_; std::vector<int> L1_cache_;
...@@ -94,14 +93,15 @@ class DeviceInfo { ...@@ -94,14 +93,15 @@ class DeviceInfo {
std::vector<bool> fp16_; std::vector<bool> fp16_;
std::vector<bool> dot_; std::vector<bool> dot_;
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores, // LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core, // LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores // LITE_POWER_FULL stands for using all cores
lite_api::PowerMode mode_; static thread_local lite_api::PowerMode mode_;
std::vector<int> active_ids_; static thread_local ARMArch arch_;
TensorLite workspace_; static thread_local int mem_size_;
int64_t count_{0}; static thread_local std::vector<int> active_ids_;
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
void SetDotInfo(int argc, ...); void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...); void SetFP16Info(int argc, ...);
...@@ -119,7 +119,6 @@ class DeviceInfo { ...@@ -119,7 +119,6 @@ class DeviceInfo {
DeviceInfo() = default; DeviceInfo() = default;
}; };
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
template <TargetType Type> template <TargetType Type>
......
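A minimal sketch of what the thread_local change above enables; DeviceInfo::Global() and the exact PowerMode spelling are assumed from the surrounding codebase rather than shown in this hunk:

#include <thread>
// Each thread now gets its own mode_/arch_/active_ids_/workspace_, so two
// threads can bind different run modes without racing (sketch only):
void worker(paddle::lite_api::PowerMode mode, int num_threads) {
  auto& dev = paddle::lite::DeviceInfo::Global();  // assumed singleton accessor
  dev.SetRunMode(mode, num_threads);               // per-thread from this patch on
  // ... run inference on this thread ...
}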
...@@ -34,13 +34,16 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -34,13 +34,16 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
// fuse quantized node and dequant node // fuse quantized node and dequant node
std::vector<std::string> quantized_op_types = { for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
"conv2d", "mul", "depthwise_conv2d"};
for (auto& op_type : quantized_op_types) {
fusion::DequantOpFuser fuser(op_type); fusion::DequantOpFuser fuser(op_type);
fuser(graph.get()); fuser(graph.get());
} }
for (auto& op_type : {"conv2d", "depthwise_conv2d"}) {
fusion::ChannelWiseDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// delete quant_dequant_node // delete quant_dequant_node
for (auto op_type : {"pool2d", "elementwise_add"}) { for (auto op_type : {"pool2d", "elementwise_add"}) {
fusion::DeleteQuantDequantOpFuser fuser(op_type); fusion::DeleteQuantDequantOpFuser fuser(op_type);
......
...@@ -79,23 +79,26 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { ...@@ -79,23 +79,26 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
void DequantOpFuser::BuildPattern() { void DequantOpFuser::BuildPattern() {
std::string weight_name = ""; std::string weight_name = "";
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { if (quantized_op_type_ == "conv2d" ||
quantized_op_type_ == "depthwise_conv2d") {
weight_name = "Filter"; weight_name = "Filter";
} else { } else {
weight_name = "Y"; weight_name = "Y";
} }
auto* quantized_op_input = auto* quantized_op_input = VarNode("quantized_op_input")
VarNode("quantized_op_input")->assert_is_op_input(op_type_)->AsInput(); ->assert_is_op_input(quantized_op_type_)
auto* quantized_op_weight = VarNode("quantized_op_weight") ->AsInput();
->assert_is_op_input(op_type_, weight_name) auto* quantized_op_weight =
->AsInput(); VarNode("quantized_op_weight")
auto* quantized_op = OpNode("quantized_op", op_type_) ->assert_is_op_input(quantized_op_type_, weight_name)
->assert_is_op(op_type_) ->AsInput();
auto* quantized_op = OpNode("quantized_op", quantized_op_type_)
->assert_is_op(quantized_op_type_)
->AsIntermediate(); ->AsIntermediate();
auto* quantized_op_out = auto* quantized_op_out =
VarNode("quantized_op_out") VarNode("quantized_op_out")
->assert_is_op_output(op_type_) ->assert_is_op_output(quantized_op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X") ->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate(); ->AsIntermediate();
auto* dequant_op = OpNode("dequant_op", "fake_dequantize_max_abs") auto* dequant_op = OpNode("dequant_op", "fake_dequantize_max_abs")
...@@ -110,12 +113,13 @@ void DequantOpFuser::BuildPattern() { ...@@ -110,12 +113,13 @@ void DequantOpFuser::BuildPattern() {
quantized_op_out->LinksFrom({quantized_op}); quantized_op_out->LinksFrom({quantized_op});
dequant_op->LinksFrom({quantized_op_out}); dequant_op->LinksFrom({quantized_op_out});
dequant_op_out->LinksFrom({dequant_op}); dequant_op_out->LinksFrom({dequant_op});
VLOG(4) << "DeQuantOpFuser BuildPattern op_type:" << op_type_;
VLOG(4) << "DeQuantOpFuser BuildPattern op_type:" << quantized_op_type_;
} }
void DequantOpFuser::InsertNewNode(SSAGraph* graph, void DequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) { const key2nodes_t& matched) {
auto* quant_op_input = matched.at("quantized_op_input"); auto* quantized_op_input = matched.at("quantized_op_input");
auto* quantized_op_weight = matched.at("quantized_op_weight"); auto* quantized_op_weight = matched.at("quantized_op_weight");
auto* quantized_op = matched.at("quantized_op"); auto* quantized_op = matched.at("quantized_op");
auto* dequant_op = matched.at("dequant_op"); auto* dequant_op = matched.at("dequant_op");
...@@ -142,14 +146,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -142,14 +146,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>(); scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale; std::vector<float> weight_scale;
int weight_scale_size; int weight_scale_size;
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { if (quantized_op_type_ == "conv2d" ||
op_desc.SetInput("Input", {quant_op_input->arg()->name}); quantized_op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout. // be Cout.
weight_scale_size = quantized_weight_t->dims()[0]; weight_scale_size = quantized_weight_t->dims()[0];
} else if (op_type_ == "mul") { } else if (quantized_op_type_ == "mul") {
op_desc.SetInput("X", {quant_op_input->arg()->name}); op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout. // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
weight_scale_size = quantized_weight_t->dims()[1]; weight_scale_size = quantized_weight_t->dims()[1];
...@@ -174,11 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -174,11 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
quantized_weight_t->set_precision(PRECISION(kInt8)); quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes // new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(op_type_); auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
new_quantized_op->Attach(op_desc, scope); new_quantized_op->Attach(op_desc, scope);
auto* new_quantized_op_node = auto* new_quantized_op_node =
graph->GraphCreateInstructNode(new_quantized_op, valid_places); graph->GraphCreateInstructNode(new_quantized_op, valid_places);
IR_NODE_LINK_TO(quant_op_input, new_quantized_op_node); IR_NODE_LINK_TO(quantized_op_input, new_quantized_op_node);
IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node); IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node);
IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out); IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out);
} }
...@@ -188,6 +193,107 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { ...@@ -188,6 +193,107 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc; return op_desc;
} }
void ChannelWiseDequantOpFuser::BuildPattern() {
std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs";
auto* quantized_op_input = VarNode("quantized_op_input")
->assert_is_op_input(quantized_op_type_)
->AsInput();
auto* quantized_op_weight =
VarNode("quantized_op_weight")
->assert_is_op_input(quantized_op_type_, "Filter")
->AsInput();
auto* quantized_op = OpNode("quantized_op", quantized_op_type_)
->assert_is_op(quantized_op_type_)
->AsIntermediate();
auto* quantized_op_out = VarNode("quantized_op_out")
->assert_is_op_output(quantized_op_type_)
->assert_is_op_input(dequant_op_type, "X")
->AsIntermediate();
auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale")
->assert_is_op_input(dequant_op_type)
->AsIntermediate();
auto* dequant_op = OpNode("dequant_op", dequant_op_type)
->assert_is_op(dequant_op_type)
->AsIntermediate();
auto* dequant_op_out = VarNode("dequant_op_out")
->assert_is_op_output(dequant_op_type, "Out")
->AsOutput();
quantized_op->LinksFrom({quantized_op_input, quantized_op_weight});
quantized_op_out->LinksFrom({quantized_op});
dequant_op->LinksFrom({quantized_op_out, dequant_op_channel_scale});
dequant_op_out->LinksFrom({dequant_op});
VLOG(4) << "ChannelWiseDequantOpFuser BuildPattern op_type:"
<< quantized_op_type_;
}
void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* quantized_op_input = matched.at("quantized_op_input");
auto* quantized_op_weight = matched.at("quantized_op_weight");
auto* quantized_op = matched.at("quantized_op");
auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale");
auto* dequant_op = matched.at("dequant_op");
auto* dequant_op_out = matched.at("dequant_op_out");
// obtain input_scale and weight_scale
auto* scope = quantized_op->stmt()->op()->scope();
auto& valid_places = quantized_op->stmt()->op()->valid_places();
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
std::vector<float> weight_scale;
std::vector<int> quant_bits =
dequant_op->stmt()->op_info()->GetAttr<std::vector<int>>("quant_bits");
int weight_bit_length = quant_bits[0];
int range = ((1 << (weight_bit_length - 1)) - 1);
auto channel_scale_name = dequant_op_channel_scale->arg()->name;
auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range);
}
// set op desc
cpp::OpDesc op_desc = *quantized_op->stmt()->op_info();
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
auto quantized_weight_var_name = quantized_op_weight->arg()->name;
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
Tensor temp_tensor;
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < quantized_weight_t->data_size(); i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
new_quantized_op->Attach(op_desc, scope);
auto* new_quantized_op_node =
graph->GraphCreateInstructNode(new_quantized_op, valid_places);
IR_NODE_LINK_TO(quantized_op_input, new_quantized_op_node);
IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node);
IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out);
}
cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
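A quick numeric check of the per-channel scale conversion in InsertNewNode above:

// quant_bits = {8}  ->  range = (1 << 7) - 1 = 127
// stored channel scale 127.0  ->  weight_scale[c] = 127.0 / 127 = 1.0
// stored channel scale  63.5  ->  weight_scale[c] = 0.5
// so the int8 weight recovers w_float ≈ w_int8 * weight_scale[c] at runtime.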
void DeleteQuantDequantOpFuser::BuildPattern() { void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type = std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max"; "fake_quantize_dequantize_moving_average_abs_max";
......
...@@ -24,18 +24,21 @@ namespace mir { ...@@ -24,18 +24,21 @@ namespace mir {
namespace fusion { namespace fusion {
/* The model trained by fluid quantization is a simulation of real int8. /* The model trained by fluid quantization is a simulation of real int8.
* The quantized Ops(conv2d, mul, depthwise conv2d etc) have fake_quantop * The quantized Ops(conv2d, mul, depthwise conv2d etc) have fake_quant op
* in front and fake_dequantop behind. * in front and fake_dequant op behind.
* *
* When in int8 mode, the pattern like "fake_quant + quantized_op + * For int8 mode, the pattern like "fake_quant + quantized_op + fake_dequant"
* fake_dequant" * can be processed by the following three fuser. The fuser extract the
* can be detected by this fuser. The fuser extract the input_scale and * input_scale and the weight_scale info from fake_quant, fake_dequant op and
* the weight_scale info from fake_quant, fake_dequant op and fuse those into * fuse those into the quantized_op.
* the quantized_op.
 * In addition, the fusers delete the fake_quant and fake_dequant ops from the * In addition, the fusers delete the fake_quant and fake_dequant ops from the
 * graph at the end. * graph at the end.
*/ */
/* DeleteQuantOpFuser processes
* fake_quantize_range_abs_max/fake_quantize_moving_average_abs_max
* + conv2d/mul/depthwise.
*/
class DeleteQuantOpFuser : public FuseBase { class DeleteQuantOpFuser : public FuseBase {
public: public:
explicit DeleteQuantOpFuser(const std::string& quant_op_type) explicit DeleteQuantOpFuser(const std::string& quant_op_type)
...@@ -50,9 +53,12 @@ class DeleteQuantOpFuser : public FuseBase { ...@@ -50,9 +53,12 @@ class DeleteQuantOpFuser : public FuseBase {
std::string quant_op_type_{}; std::string quant_op_type_{};
}; };
/* DequantOpFuser processes conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
*/
class DequantOpFuser : public FuseBase { class DequantOpFuser : public FuseBase {
public: public:
explicit DequantOpFuser(const std::string& op_type) : op_type_(op_type) {} explicit DequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override; void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
...@@ -60,7 +66,24 @@ class DequantOpFuser : public FuseBase { ...@@ -60,7 +66,24 @@ class DequantOpFuser : public FuseBase {
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private: private:
std::string op_type_{}; std::string quantized_op_type_{};
};
/* ChannelWiseDequantOpFuser processes conv2d/depthwise_conv2d +
* fake_channel_wise_dequantize_max_abs.
*/
class ChannelWiseDequantOpFuser : public FuseBase {
public:
explicit ChannelWiseDequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quantized_op_type_{};
}; };
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + /* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
......
...@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass {
const auto& place = places[i]; const auto& place = places[i];
float weight = static_cast<float>(place_size - i) / place_size; float weight = static_cast<float>(place_size - i) / place_size;
size_t score{}; size_t score{};
// The more important factor comes first // The more important factor comes first
if (kernel_pick_factors_.IsTargetConsidered() && if (kernel_pick_factors_.IsTargetConsidered() &&
(place.target == kernel.target() || kernel.target() == TARGET(kAny) || (place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
...@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass {
VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------"; VLOG(4) << "-------- pick summary --------";
VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< DataLayoutToStr(winner_place.layout) << " " << " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target); << TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():" VLOG(4) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " " << PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " " << DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target); << TargetToStr(kernel.place().target);
VLOG(4) << "kernel.op_type():" << kernel.op_type(); VLOG(4) << "kernel.op_type():" << kernel.op_type();
VLOG(4) << "picker tactic " << kernel_pick_factors_; VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
VLOG(4) << "kernel place " << kernel.place().DebugString(); VLOG(4) << "kernel place:" << kernel.place().DebugString();
VLOG(4) << "picker place " << winner_place.DebugString(); VLOG(4) << "winner_picker place:" << winner_place.DebugString();
VLOG(4) << "------------------------------"; VLOG(4) << "------------------------------";
// The data layout is not considered, for the input and output arguments // The data layout is not considered, for the input and output arguments
......
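A quick numeric illustration of the place weighting used in the scoring above:

// weight = (place_size - i) / place_size, where i is the index in valid_places.
// With three valid places: i = 0 -> 1.00, i = 1 -> 0.67, i = 2 -> 0.33,
// so a kernel matching an earlier (more preferred) place scores higher; this is
// why the demos insert the preferred Place at the front of valid_places.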
...@@ -35,7 +35,7 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode( ...@@ -35,7 +35,7 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
lite::mir::Node* var_node, const Scope* scope) { lite::mir::Node* var_node, const Scope* scope) {
CHECK(var_node->IsArg()); CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg(); const auto& arg = var_node->AsArg();
VLOG(4) << "Convert var node " << arg.name; VLOG(4) << "[NPU] Convert var node " << arg.name;
auto* var = scope->FindVar(arg.name); auto* var = scope->FindVar(arg.name);
CHECK(var); CHECK(var);
...@@ -44,13 +44,13 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode( ...@@ -44,13 +44,13 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
auto dims = tensor->dims(); auto dims = tensor->dims();
if (arg.is_weight) { if (arg.is_weight) {
auto wgt = std::make_shared<ge::op::Const>(arg.name); auto wgt = std::make_shared<ge::op::Const>(arg.name);
LOG(INFO) << "in convert const:" << arg.name; LOG(INFO) << "[NPU] Convert const var node " << arg.name;
VLOG(4) << dims; VLOG(4) << dims;
wgt->set_attr_value(lite::npu::CvtFromLiteTensor(tensor)); wgt->set_attr_value(lite::npu::CvtTensor(tensor));
return wgt; return wgt;
} else { } else {
CHECK_EQ(dims.size(), 4); CHECK_EQ(dims.size(), 4);
LOG(INFO) << "in convert data:" << arg.name; LOG(INFO) << "[NPU] Convert data var node " << arg.name;
LOG(INFO) << dims; LOG(INFO) << dims;
// TODO(xxx): support more types and dims size // TODO(xxx): support more types and dims size
ge::TensorDesc desc(ge::Shape(dims.Vectorize()), ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
...@@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( ...@@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph(
// persistable=true, so that the model parser can recognize it and save it to // persistable=true, so that the model parser can recognize it and save it to
// param files // param files
if (!lite::npu::BuildModel(inputs, outputs, weight)) { if (!lite::npu::BuildModel(inputs, outputs, weight)) {
LOG(WARNING) << "Build NPU failed subgraph " << sub_id; LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")";
throw std::runtime_error("Build NPU failed subgraph."); throw std::runtime_error("Build NPU graph failed.");
} }
LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id; LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")";
return weight_var_name; return weight_var_name;
} }
...@@ -166,12 +166,12 @@ void GenerateNPUProgramPass::GenNPUSubgraph( ...@@ -166,12 +166,12 @@ void GenerateNPUProgramPass::GenNPUSubgraph(
} }
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get()); LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions(); const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types; std::vector<std::string> supported_op_types;
for (auto& i : op_map) { for (auto& i : op_map) {
LOG(INFO) << "Supported type: " << i.first; LOG(INFO) << "[NPU] Supported type: " << i.first;
supported_op_types.push_back(i.first); supported_op_types.push_back(i.first);
} }
...@@ -182,15 +182,15 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -182,15 +182,15 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
CHECK_EQ(op_nodes_all.size(), num_subgraph); CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1; int id = 1;
for (auto& op_nodes : op_nodes_all) { for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "Converting subgraph_id:" << id; LOG(INFO) << "[NPU] Converting Subgraph " << id;
GenNPUSubgraph(graph, op_nodes.second, id); GenNPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "After NPU Pass Subgraph " << id << "\n" LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get()); << Visualize(graph.get());
id++; id++;
} }
} catch (...) { } catch (...) {
LOG(WARNING) << "Build NPU graph failed"; LOG(WARNING) << "[NPU] Build NPU graph failed.";
throw std::runtime_error("Build NPU graph failed"); throw std::runtime_error("[NPU] Build NPU graph failed.");
} }
for (auto& item : graph->StmtTopologicalOrder()) { for (auto& item : graph->StmtTopologicalOrder()) {
...@@ -203,7 +203,7 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -203,7 +203,7 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() { std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() {
LOG(INFO) << "insts.size " << insts_.size(); LOG(INFO) << "[NPU] program insts.size " << insts_.size();
std::unique_ptr<RuntimeProgram> program( std::unique_ptr<RuntimeProgram> program(
new RuntimeProgram(std::move(insts_))); new RuntimeProgram(std::move(insts_)));
return program; return program;
......
...@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( ...@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst(
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
#ifdef LITE_WITH_OPENCL
// layout kernel choose // layout kernel choose
// must ignore [layout check] for layout of kernels's input and output // must ignore [layout check] for layout of kernels's input and output
if (TargetCompatibleTo(*in_arg_ty, from) && // note: replace LITE_WITH_OPENCL macro with judge input and output target
PrecisionCompatibleTo(*in_arg_ty, from) && // of layout_trans
DeviceCompatibleTo(*in_arg_ty, from) && if ((in_arg_ty->target() == TARGET(kOpenCL) ||
out_arg_ty->layout() == to.layout()) { out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
#else (TargetCompatibleTo(*in_arg_ty, from) &&
if (TypeCompatible(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) { DeviceCompatibleTo(*in_arg_ty, from) &&
#endif out_arg_ty->layout() == to.layout())) {
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
is_found = true; is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op);
break; break;
} }
} }
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
<< in->AsArg().name << "->" << to << ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
......
...@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
VLOG(4) << "to:" << to << "\n"; VLOG(4) << "to:" << to << "\n";
// kernel choose branch for opencl backend // kernel choose branch for opencl backend
// judge inst's target whether is kOpenCL // judge inst's target whether is kOpenCL
// Note: to == *decl_arg_type == in of inst, not output of last inst // Note: to == *decl_arg_type == in of inst, not output of last inst
#ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout between [to] and [from] // ignore [layout check] for layout between [to] and [from]
// Because all of origin opencl insts in model, are not default layout // Because all of origin opencl insts in model, are not default layout
// NCHW, // NCHW,
...@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst(
// [*decl_arg_type] -> [to]: input of inst, not output of last // [*decl_arg_type] -> [to]: input of inst, not output of last
// [in_arg_ty]: in of io_copy // [in_arg_ty]: in of io_copy
// [out_arg_ty]: out of io_copy // [out_arg_ty]: out of io_copy
if (TargetCompatibleTo(*in_arg_ty, from) && //
PrecisionCompatibleTo(*in_arg_ty, from) && // note: replace LITE_WITH_OPENCL macro with judge input and output
DeviceCompatibleTo(*in_arg_ty, from) && // of io_copy
TargetCompatibleTo(*out_arg_ty, to)) { if ((in_arg_ty->target() == TARGET(kOpenCL) ||
VLOG(4) << "do nothing. opencl found"; out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
#else (TargetCompatibleTo(*in_arg_ty, from) &&
if (TypeCompatible(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) { DeviceCompatibleTo(*in_arg_ty, from) &&
#endif TargetCompatibleTo(*out_arg_ty, to))) {
VLOG(4) << "picked, opencl found";
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
VLOG(4) << "picked"; VLOG(4) << "picked";
is_found = true; is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
io_copy_inst->AsStmt( io_copy_inst->AsStmt(
io_copy_type, std::move(selected_kernels), io_copy_op); io_copy_type, std::move(selected_kernels), io_copy_op);
break; break;
} }
VLOG(4) << "not picked"; VLOG(4) << "not picked";
} }
CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
<< ":" << in->AsArg().name << " -> " << to << ":" << ":" << in->AsArg().name << " -> " << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
......
...@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass {
} }
} }
// Set the tye of the weight // Set the type of the weight
void SetWeightType(Node* w, const LiteType& type) { void SetWeightType(Node* w,
// TODO(xg) to optimize this const LiteType& type,
#ifdef LITE_WITH_FPGA const std::map<std::string, bool>& lite_with_targets) {
w->AsArg().type = LiteType::GetTensorTy( VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); if (lite_with_targets.at("kFPGA")) {
#endif w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#ifdef LITE_WITH_OPENCL } else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy( w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#endif } else {
w->AsArg().type = LiteType::GetTensorTy(
#ifndef LITE_WITH_FPGA TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
#ifndef LITE_WITH_OPENCL }
w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
#endif
#endif
} }
void InferenceArgumentPlace(SSAGraph* graph) { void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) { for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt(); auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference. // The IoCopyOp is a tool operator, it won't support the type inference.
// in fpga, we have io_copy+cali+layout tool ops, so we need type inference for // in fpga, we have io_copy+cali+layout tool ops, so we need type inference
// tool operator // for
#ifndef LITE_WITH_FPGA // tool operator
#ifndef LITE_WITH_OPENCL if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue"; VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue; if (inst.op_type() == "io_copy") continue;
#endif }
#endif
// deal with inputs // deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr(); VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments. // TODO(zhaolong): Add check if the node's name in op's arguments.
...@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_in->AsArg().type) { if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) { if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type); SetWeightType(x_in, *type, lite_with_targets);
} else { } else {
x_in->AsArg().type = type; x_in->AsArg().type = type;
} }
...@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_out->AsArg().type) { if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) { if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type); SetWeightType(x_out, *type, lite_with_targets);
} else { } else {
x_out->AsArg().type = type; x_out->AsArg().type = type;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -49,6 +50,22 @@ class Optimizer { ...@@ -49,6 +50,22 @@ class Optimizer {
valid_places_ = valid_places; valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found"; CHECK(!graph_) << "duplicate optimize found";
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kNPU", valid_places_has_target(TARGET(kNPU))},
{"kXPU", valid_places_has_target(TARGET(kXPU))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];
graph_.reset(new mir::SSAGraph); graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places); graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places); graph_->SetValidPlaces(valid_places);
...@@ -57,14 +74,11 @@ class Optimizer { ...@@ -57,14 +74,11 @@ class Optimizer {
InitTargetTypeTransformPass(); InitTargetTypeTransformPass();
if (passes.empty()) { if (passes.empty()) {
RunPasses(std::vector<std::string>{ std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", // {"lite_quant_dequant_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", // "lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// This pass is disabled to force some opencl kernels selected for
// final running, otherwise, they will be fused to ARM fusion
// kernels, and the OpenCL devices will be discarded.
// TODO(Superjomn) Refine the fusion related design to select fusion // TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically. // kernels for devices automatically.
"lite_conv_activation_fuse_pass", // "lite_conv_activation_fuse_pass", //
...@@ -105,16 +119,17 @@ class Optimizer { ...@@ -105,16 +119,17 @@ class Optimizer {
"argument_type_display_pass", // "argument_type_display_pass", //
"variable_place_inference_pass", // "variable_place_inference_pass", //
"argument_type_display_pass", // "argument_type_display_pass",
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", // "argument_type_display_pass"}};
#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
!defined(LITE_WITH_XPU) (!lite_with_targets["kXPU"])) {
// TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
"memory_optimize_pass", // kernel
#endif passes_local.emplace_back("memory_optimize_pass");
"argument_type_display_pass"}}); }
RunPasses(passes_local);
} else { } else {
RunPasses(passes); RunPasses(passes);
} }
...@@ -141,6 +156,7 @@ class Optimizer { ...@@ -141,6 +156,7 @@ class Optimizer {
.LookUp<mir::subgraph::GenerateNPUProgramPass>( .LookUp<mir::subgraph::GenerateNPUProgramPass>(
"generate_npu_program_pass"); "generate_npu_program_pass");
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
auto pass = mir::PassManager::Global() auto pass = mir::PassManager::Global()
.LookUp<mir::subgraph::GenerateXPUProgramPass>( .LookUp<mir::subgraph::GenerateXPUProgramPass>(
......
...@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) { ...@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res; return res;
} }
// 0. Enable OpenCL, if needed
// Enable the `DEMO_WITH_OPENCL` macro below if you need to run on the GPU (OpenCL).
// #define DEMO_WITH_OPENCL
void RunModel() { void RunModel() {
// 1. Set CxxConfig // 1. Set CxxConfig
CxxConfig config; CxxConfig config;
config.set_model_dir(FLAGS_model_dir); config.set_model_dir(FLAGS_model_dir);
#ifdef DEMO_WITH_OPENCL
std::vector<Place> valid_places{
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
Place{TARGET(kARM), PRECISION(kFloat)}};
#else
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
#endif
if (FLAGS_prefer_int8_kernel) { if (FLAGS_prefer_int8_kernel) {
valid_places.insert(valid_places.begin(), valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)}); Place{TARGET(kARM), PRECISION(kInt8)});
......
...@@ -100,5 +100,9 @@ lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_ ...@@ -100,5 +100,9 @@ lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
if(LITE_BUILD_EXTRA)
lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm)
endif()
...@@ -49,10 +49,7 @@ void CastCompute::Run() { ...@@ -49,10 +49,7 @@ void CastCompute::Run() {
const int32_t* x_data_begin = param.X->data<int32_t>(); const int32_t* x_data_begin = param.X->data<int32_t>();
const int32_t* x_data_end = x_data_begin + param.X->numel(); const int32_t* x_data_end = x_data_begin + param.X->numel();
float* out_data = param.Out->mutable_data<float>(); float* out_data = param.Out->mutable_data<float>();
// std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, float>);
// float>);
// todo: the input type actually is float.
memcpy(out_data, x_data_begin, sizeof(float) * param.X->numel());
} else if (param.in_dtype == 20 && param.out_dtype == 5) { // uint8->float32 } else if (param.in_dtype == 20 && param.out_dtype == 5) { // uint8->float32
const unsigned char* x_data_begin = param.X->data<unsigned char>(); const unsigned char* x_data_begin = param.X->data<unsigned char>();
const unsigned char* x_data_end = x_data_begin + param.X->numel(); const unsigned char* x_data_end = x_data_begin + param.X->numel();
......
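The numeric dtype codes compared in CastCompute::Run() follow fluid's VarType numbering; the sketch below spells out the values this kernel appears to rely on (the exact constants are an assumption inferred from the in-code comments, not the library's authoritative definition).

// Hypothetical reference table, mirroring the codes used in CastCompute::Run():
enum class AssumedFluidDType : int32_t {
  kInt32 = 2,   // "int32"
  kFP32 = 5,    // "float32"
  kUInt8 = 20,  // "uint8"
};
// e.g. `in_dtype == 20 && out_dtype == 5` reads as "cast uint8 -> float32".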
...@@ -39,6 +39,11 @@ void ConcatCompute::Run() { ...@@ -39,6 +39,11 @@ void ConcatCompute::Run() {
std::vector<lite::Tensor*> inputs = param.x; std::vector<lite::Tensor*> inputs = param.x;
auto* out = param.output; auto* out = param.output;
int axis = param.axis; int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
out->mutable_data<float>(); out->mutable_data<float>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis. /// Sometimes direct copies will be faster, this maybe need deeply analysis.
...@@ -83,5 +88,7 @@ void ConcatCompute::Run() { ...@@ -83,5 +88,7 @@ void ConcatCompute::Run() {
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def) concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -20,15 +20,10 @@ namespace kernels { ...@@ -20,15 +20,10 @@ namespace kernels {
namespace arm { namespace arm {
template <> template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->template Param<param_t>(); auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto w_dims = param.filter->dims();
auto o_dims = param.output->dims();
if (last_shape_ == x_dims) {
return;
}
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
// extend workspace
if (param.strides[0] == 2) { if (param.strides[0] == 2) {
ctx.ExtendWorkspace( ctx.ExtendWorkspace(
lite::arm::math::conv3x3s2_direct_workspace_size(param, &ctx)); lite::arm::math::conv3x3s2_direct_workspace_size(param, &ctx));
...@@ -36,12 +31,7 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { ...@@ -36,12 +31,7 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
ctx.ExtendWorkspace( ctx.ExtendWorkspace(
lite::arm::math::conv3x3s1_direct_workspace_size(param, &ctx)); lite::arm::math::conv3x3s1_direct_workspace_size(param, &ctx));
} }
}
template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = weights_.data<float>(); const auto* w_data = weights_.data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr; const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
...@@ -89,9 +79,6 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -89,9 +79,6 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
} }
} }
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::ReInitWhenNeeded() {}
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -148,9 +135,6 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -148,9 +135,6 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
} }
} }
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::ReInitWhenNeeded() {}
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
......
...@@ -156,7 +156,6 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -156,7 +156,6 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto w_dims = param.filter->dims(); auto w_dims = param.filter->dims();
auto o_dims = param.output->dims(); auto o_dims = param.output->dims();
last_shape_ = x_dims;
int ic = x_dims[1]; int ic = x_dims[1];
int oc = o_dims[1]; int oc = o_dims[1];
...@@ -179,12 +178,10 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -179,12 +178,10 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
w_scale_); w_scale_);
} }
virtual void ReInitWhenNeeded();
virtual void Run(); virtual void Run();
/// todo, support inplace weights transform /// todo, support inplace weights transform
protected: protected:
DDim last_shape_;
Tensor weights_; Tensor weights_;
Tensor bias_; Tensor bias_;
bool flag_trans_weights_{false}; bool flag_trans_weights_{false};
......
...@@ -85,6 +85,7 @@ template <> ...@@ -85,6 +85,7 @@ template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<float>(); auto weights = param.filter->data<float>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<float>(); weights = weights_.data<float>();
...@@ -120,6 +121,7 @@ template <> ...@@ -120,6 +121,7 @@ template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<int8_t>(); auto weights = param.filter->data<int8_t>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<int8_t>(); weights = weights_.data<int8_t>();
...@@ -179,6 +181,7 @@ template <> ...@@ -179,6 +181,7 @@ template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<int8_t>(); auto weights = param.filter->data<int8_t>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<int8_t>(); weights = weights_.data<int8_t>();
......
...@@ -72,7 +72,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -72,7 +72,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
} else { } else {
//! im2col gemmlike conv //! im2col gemmlike conv
flag_1x1gemm_ = false; flag_1x1gemm_ = false;
ctx.ExtendWorkspace(k * n * sizeof(float)); workspace_size_ = k * n * sizeof(float);
} }
if (!flag_trans_weights_ && n > 1) { if (!flag_trans_weights_ && n > 1) {
lite::arm::math::trans_gemm_weights<Ptype>( lite::arm::math::trans_gemm_weights<Ptype>(
...@@ -97,6 +97,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -97,6 +97,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
bool flag_trans_bias_{false}; bool flag_trans_bias_{false};
Tensor weights_; Tensor weights_;
Tensor bias_; Tensor bias_;
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -40,13 +40,13 @@ void Conv2DTransposeCompute::PrepareForRun() { ...@@ -40,13 +40,13 @@ void Conv2DTransposeCompute::PrepareForRun() {
int group = param.groups; int group = param.groups;
// deconv weights layout: chin * chout * kh * kw // deconv weights layout: chin * chout * kh * kw
auto& ctx = this->ctx_->template As<ARMContext>();
int m = chout * kw * kh / group; int m = chout * kw * kh / group;
int n = hin * win; int n = hin * win;
int k = chin / group; int k = chin / group;
ctx.ExtendWorkspace(group * m * n * sizeof(float)); workspace_size_ = group * m * n * sizeof(float);
auto& ctx = this->ctx_->template As<ARMContext>();
lite::Tensor tmp_weights; lite::Tensor tmp_weights;
lite::arm::math::prepackA( lite::arm::math::prepackA(
&tmp_weights, *(param.filter), 1.f, m, k, group, true, &ctx); &tmp_weights, *(param.filter), 1.f, m, k, group, true, &ctx);
...@@ -57,6 +57,8 @@ void Conv2DTransposeCompute::PrepareForRun() { ...@@ -57,6 +57,8 @@ void Conv2DTransposeCompute::PrepareForRun() {
} }
void Conv2DTransposeCompute::Run() { void Conv2DTransposeCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto o_dims = param.output->dims(); auto o_dims = param.output->dims();
...@@ -80,7 +82,6 @@ void Conv2DTransposeCompute::Run() { ...@@ -80,7 +82,6 @@ void Conv2DTransposeCompute::Run() {
int group_size_in = win * hin * chin / group; int group_size_in = win * hin * chin / group;
int group_size_out = wout * hout * chout / group; int group_size_out = wout * hout * chout / group;
int group_size_coldata = m * n; int group_size_coldata = m * n;
auto& ctx = this->ctx_->template As<ARMContext>();
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock); int m_roundup = hblock * ((m + hblock - 1) / hblock);
int group_size_weights = ((m_roundup * k + 15) / 16) * 16; int group_size_weights = ((m_roundup * k + 15) / 16) * 16;
......
...@@ -32,6 +32,9 @@ class Conv2DTransposeCompute ...@@ -32,6 +32,9 @@ class Conv2DTransposeCompute
void Run() override; void Run() override;
~Conv2DTransposeCompute() = default; ~Conv2DTransposeCompute() = default;
protected:
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -46,8 +46,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { ...@@ -46,8 +46,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
int max_ch = ic > oc ? ic : oc; int max_ch = ic > oc ? ic : oc;
const int n_wino = size_tile; const int n_wino = size_tile;
ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float);
sizeof(float));
last_shape_ = x_dims; last_shape_ = x_dims;
} }
...@@ -76,8 +75,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -76,8 +75,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_round = hblock * ((m_wino + hblock - 1) / hblock); int m_round = hblock * ((m_wino + hblock - 1) / hblock);
weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic});
ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float);
sizeof(float));
auto weights_wino = auto weights_wino =
static_cast<float*>(malloc(sizeof(float) * 8 * 8 * oc * ic)); static_cast<float*>(malloc(sizeof(float) * 8 * 8 * oc * ic));
void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic);
...@@ -106,6 +104,9 @@ template <> ...@@ -106,6 +104,9 @@ template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
// extend workspace
ctx.ExtendWorkspace(workspace_size_);
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = weights_.data<float>(); const auto* w_data = weights_.data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr; const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
......
...@@ -39,6 +39,7 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -39,6 +39,7 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
Tensor weights_; Tensor weights_;
DDim last_shape_; DDim last_shape_;
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -127,7 +127,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -127,7 +127,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
k_, k_,
param.bias != nullptr, param.bias != nullptr,
b_data, b_data,
false); false,
&ctx);
} }
} }
} }
......
...@@ -29,9 +29,25 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -29,9 +29,25 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>(); auto& context = ctx_->As<ARMContext>();
auto data = param.Out->template mutable_data<T>(); if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
for (int i = 0; i < param.Out->numel(); i++) { auto data = param.Out->template mutable_data<float>();
data[i] = param.value; for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
} }
} }
...@@ -54,9 +70,25 @@ class FillConstantBatchLikeCompute ...@@ -54,9 +70,25 @@ class FillConstantBatchLikeCompute
param.out->Resize(odims); param.out->Resize(odims);
} }
auto data = param.out->template mutable_data<T>(); if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
for (int i = 0; i < param.out->numel(); i++) { auto data = param.out->template mutable_data<float>();
data[i] = param.value; for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
} }
} }
......
...@@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { ...@@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>(); auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X; lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize; lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out; lite::Tensor* Out = param.Out;
float scale = param.scale; float scale = param.scale;
int out_w = param.out_w; int out_w = param.out_w;
...@@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { ...@@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() {
std::string interp_method = "Bilinear"; std::string interp_method = "Bilinear";
lite::arm::math::interpolate(X, lite::arm::math::interpolate(X,
OutSize, OutSize,
SizeTensor,
Scale,
Out, Out,
out_h, out_h,
out_w, out_w,
scale, scale,
scale,
align_corners, align_corners,
interp_method); interp_method);
} }
...@@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { ...@@ -49,6 +52,8 @@ void NearestInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>(); auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X; lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize; lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out; lite::Tensor* Out = param.Out;
float scale = param.scale; float scale = param.scale;
int out_w = param.out_w; int out_w = param.out_w;
...@@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { ...@@ -57,11 +62,12 @@ void NearestInterpCompute::Run() {
std::string interp_method = "Nearest"; std::string interp_method = "Nearest";
lite::arm::math::interpolate(X, lite::arm::math::interpolate(X,
OutSize, OutSize,
SizeTensor,
Scale,
Out, Out,
out_h, out_h,
out_w, out_w,
scale, scale,
scale,
align_corners, align_corners,
interp_method); interp_method);
} }
...@@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, ...@@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, ...@@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -28,7 +28,6 @@ namespace arm { ...@@ -28,7 +28,6 @@ namespace arm {
void LookupTableCompute::Run() { void LookupTableCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs // inputs
auto w = param.W; auto w = param.W;
auto ids = param.Ids; auto ids = param.Ids;
...@@ -37,7 +36,7 @@ void LookupTableCompute::Run() { ...@@ -37,7 +36,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims(); auto table_dim = w->dims();
int64_t ids_numel = ids->numel(); int64_t ids_numel = ids->numel();
auto ids_data = ids->data<float>(); auto ids_data = ids->data<int64_t>();
int64_t row_number = table_dim[0]; int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1]; int64_t row_width = table_dim[1];
...@@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, ...@@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/lookup_table_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void lookup_table_compute_ref(const operators::LookupTableParam &param) {
auto *ids_t = param.Ids;
auto *output_t = param.Out;
int64_t padding_idx = param.padding_idx;
auto *ids = ids_t->data<int64_t>();
int64_t ids_numel = ids_t->dims().production();
auto *table_t = param.W;
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
auto *table = table_t->data<float>();
auto *output = output_t->mutable_data<float>();
memset(output, 0, output_t->dims().production() * sizeof(float));
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != -1 && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(float));
} else {
CHECK_LT(ids[i], row_number);
CHECK_GE(ids[i], 0);
memcpy(output + i * row_width,
table + ids[i] * row_width,
row_width * sizeof(float));
}
}
}
TEST(lookup_table_arm, retrieve_op) {
auto lookup_table =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"lookup_table");
ASSERT_FALSE(lookup_table.empty());
ASSERT_TRUE(lookup_table.front());
}
TEST(lookup_table_arm, init) {
LookupTableCompute lookup_table;
ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat));
ASSERT_EQ(lookup_table.target(), TARGET(kARM));
}
TEST(lookup_table_arm, compute) {
LookupTableCompute lookup_table;
operators::LookupTableParam param;
lite::Tensor w, ids, out, out_ref;
int64_t padding_idx = -1;
auto w_dim = DDim(std::vector<int64_t>({4, 5}));
auto ids_dim = DDim(std::vector<int64_t>({3, 2}));
auto out_dim = DDim(std::vector<int64_t>({3, 2, 5}));
w.Resize(w_dim);
ids.Resize(ids_dim);
out.Resize(out_dim);
out_ref.Resize(out_dim);
auto *w_data = w.mutable_data<float>();
auto *ids_data = ids.mutable_data<int64_t>();
auto *out_data = out.mutable_data<float>();
auto *out_ref_data = out_ref.mutable_data<float>();
int w_num = w_dim.production();
for (int i = 0; i < w_num; i++) {
w_data[i] = static_cast<float>(i + 1) / (w_num + 1);
}
int ids_num = ids_dim.production();
for (int i = 0; i < ids_num; i++) {
ids_data[i] = i % 4;
}
int out_num = out_dim.production();
param.W = &w;
param.Ids = &ids;
param.Out = &out;
lookup_table.SetParam(param);
lookup_table.Run();
param.Out = &out_ref;
lookup_table_compute_ref(param);
for (int i = 0; i < out_num; i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
...@@ -31,16 +31,16 @@ void LrnCompute::Run() { ...@@ -31,16 +31,16 @@ void LrnCompute::Run() {
int channel = x_dims[1]; int channel = x_dims[1];
int h = x_dims[2]; int h = x_dims[2];
int w = x_dims[3]; int w = x_dims[3];
const int local_size = param.local_size; const int n = param.n;
const float alpha = param.alpha; const float alpha = param.alpha;
const float beta = param.beta; const float beta = param.beta;
const float k = param.k; const float k = param.k;
if (param.norm_region == "AcrossChannels") { if (param.norm_region == "AcrossChannels") {
lite::arm::math::compute_across_channels( lite::arm::math::compute_across_channels(
x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); x_data, out_data, num, channel, h, w, n, alpha, beta, k);
} else { } else {
lite::arm::math::compute_within_channels( lite::arm::math::compute_within_channels(
x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); x_data, out_data, num, channel, h, w, n, alpha, beta, k);
} }
} }
...@@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL( ...@@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL(
lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { ...@@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) {
const dtype* x_data = param.X->data<const dtype>(); const dtype* x_data = param.X->data<const dtype>();
dtype* out_data = param.Out->mutable_data<dtype>(); dtype* out_data = param.Out->mutable_data<dtype>();
auto x_dims = param.X->dims(); auto x_dims = param.X->dims();
int local_size = param.local_size; int local_size = param.n;
float alpha = param.alpha; float alpha = param.alpha;
float beta = param.beta; float beta = param.beta;
float k = param.k; float k = param.k;
...@@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { ...@@ -171,7 +171,7 @@ TEST(lrn_arm, compute) {
} }
param.X = &x; param.X = &x;
param.Out = &output; param.Out = &output;
param.local_size = local_size; param.n = local_size;
param.alpha = alpha; param.alpha = alpha;
param.beta = beta; param.beta = beta;
param.k = k; param.k = k;
......
...@@ -232,7 +232,7 @@ void MatMulCompute::Run() { ...@@ -232,7 +232,7 @@ void MatMulCompute::Run() {
int ldc = n_; int ldc = n_;
if (n_ == 1) { if (n_ == 1) {
lite::arm::math::sgemv( lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false); x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
if (fabsf(alpha - 1.f) > 1e-8f) { if (fabsf(alpha - 1.f) > 1e-8f) {
for (size_t i = 0; i < param.Out->dims().production(); ++i) { for (size_t i = 0; i < param.Out->dims().production(); ++i) {
o_data[i] *= alpha; o_data[i] *= alpha;
......
...@@ -48,14 +48,13 @@ void MulCompute::Run() { ...@@ -48,14 +48,13 @@ void MulCompute::Run() {
CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h";
k_ = x_w; k_ = x_w;
auto& ctx = this->ctx_->template As<ARMContext>();
if (n_ == 1) { if (n_ == 1) {
lite::arm::math::sgemv( lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false); x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
} else { } else {
constexpr bool is_tranposed_y = false; constexpr bool is_tranposed_y = false;
auto& ctx = this->ctx_->template As<ARMContext>();
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_round = hblock * ((m_ + hblock - 1) / hblock); int m_round = hblock * ((m_ + hblock - 1) / hblock);
ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); ctx.ExtendWorkspace(m_round * k_ * sizeof(float));
......
...@@ -66,7 +66,6 @@ void PoolCompute::Run() { ...@@ -66,7 +66,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling_global_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling_global_avg(din, lite::arm::math::pooling_global_avg(din,
...@@ -78,7 +77,6 @@ void PoolCompute::Run() { ...@@ -78,7 +77,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling_global_ave";
return; return;
} }
} else { } else {
...@@ -93,7 +91,6 @@ void PoolCompute::Run() { ...@@ -93,7 +91,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling2x2s2_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling2x2s2_avg(din, lite::arm::math::pooling2x2s2_avg(din,
...@@ -106,7 +103,6 @@ void PoolCompute::Run() { ...@@ -106,7 +103,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling2x2s2_avg";
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
...@@ -121,7 +117,6 @@ void PoolCompute::Run() { ...@@ -121,7 +117,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invokingpooling3x3s1p1_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p1_avg(din, lite::arm::math::pooling3x3s1p1_avg(din,
...@@ -134,7 +129,32 @@ void PoolCompute::Run() { ...@@ -134,7 +129,32 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s1p1_avg"; return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3]);
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p0_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive);
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
...@@ -149,7 +169,6 @@ void PoolCompute::Run() { ...@@ -149,7 +169,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "pooling3x3s2p0_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p0_avg(din, lite::arm::math::pooling3x3s2p0_avg(din,
...@@ -162,7 +181,6 @@ void PoolCompute::Run() { ...@@ -162,7 +181,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s2p0_avg";
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
...@@ -177,7 +195,6 @@ void PoolCompute::Run() { ...@@ -177,7 +195,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling3x3s2p1_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p1_avg(din, lite::arm::math::pooling3x3s2p1_avg(din,
...@@ -190,7 +207,6 @@ void PoolCompute::Run() { ...@@ -190,7 +207,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s2p1_avg";
return; return;
} }
} }
...@@ -213,7 +229,6 @@ void PoolCompute::Run() { ...@@ -213,7 +229,6 @@ void PoolCompute::Run() {
ceil_mode, ceil_mode,
use_quantizer, use_quantizer,
pooling_type); pooling_type);
VLOG(3) << "invoking pooling_basic";
} }
} // namespace arm } // namespace arm
......
...@@ -68,8 +68,8 @@ void pool_compute_ref(const operators::PoolParam& param) { ...@@ -68,8 +68,8 @@ void pool_compute_ref(const operators::PoolParam& param) {
auto& in_dims = param.x->dims(); auto& in_dims = param.x->dims();
auto& out_dims = param.output->dims(); auto& out_dims = param.output->dims();
const float* src_ptr = param.x->data<const float>(); const float* din = param.x->data<const float>();
float* dst_ptr = param.output->mutable_data<float>(); float* dout = param.output->mutable_data<float>();
std::vector<int> ksize = param.ksize; std::vector<int> ksize = param.ksize;
std::vector<int> strides = param.strides; std::vector<int> strides = param.strides;
...@@ -83,84 +83,120 @@ void pool_compute_ref(const operators::PoolParam& param) { ...@@ -83,84 +83,120 @@ void pool_compute_ref(const operators::PoolParam& param) {
bool use_quantizer = param.use_quantizer; bool use_quantizer = param.use_quantizer;
std::string data_format = param.data_format; std::string data_format = param.data_format;
int in_n = in_dims[0]; int num = in_dims[0];
int in_c = in_dims[1]; int chin = in_dims[1];
int in_h = in_dims[2]; int hin = in_dims[2];
int in_w = in_dims[3]; int win = in_dims[3];
int size_in_n = in_c * in_h * in_w;
int size_in_c = in_h * in_w;
int out_h = out_dims[2]; int chout = out_dims[1];
int out_w = out_dims[3]; int hout = out_dims[2];
int size_out_n = in_c * out_h * out_w; int wout = out_dims[3];
int size_out_c = out_h * out_w;
int window_h = ksize[0]; // no need to pad the input tensor; the border is zero-padded inside this function
int window_w = ksize[1]; memset(dout, 0, num * chout * hout * wout * sizeof(float));
int kernel_h = ksize[0];
int kernel_w = ksize[1];
int stride_h = strides[0]; int stride_h = strides[0];
int stride_w = strides[1]; int stride_w = strides[1];
int pad_h = paddings[0]; int pad_h = paddings[0];
int pad_w = paddings[1]; int pad_w = paddings[1];
int size_channel_in = win * hin;
if (global_pooling == true) { int size_channel_out = wout * hout;
for (int n = 0; n < in_n; ++n) { if (global_pooling) {
for (int c = 0; c < in_c; ++c) { if (pooling_type == "max") { // Pooling_max
const float* src = src_ptr + n * size_in_n + c * size_in_c; for (int n = 0; n < num; ++n) {
float res = src[0]; float* dout_batch = dout + n * chout * size_channel_out;
if (pooling_type == "max") { const float* din_batch = din + n * chin * size_channel_in;
for (int i = 1; i < size_in_c; ++i) { #pragma omp parallel for
float cur_val = src[i]; for (int c = 0; c < chout; ++c) {
res = cur_val > res ? cur_val : res; const float* din_ch = din_batch + c * size_channel_in; // in address
float tmp1 = din_ch[0];
for (int i = 0; i < size_channel_in; ++i) {
float tmp2 = din_ch[i];
tmp1 = tmp1 > tmp2 ? tmp1 : tmp2;
} }
} else if (pooling_type == "avg") { dout_batch[c] = tmp1;
for (int i = 1; i < size_in_c; ++i) { }
float cur_val = src[i]; }
res += cur_val; } else if (pooling_type == "avg") {
// Pooling_average_include_padding
// Pooling_average_exclude_padding
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float sum = 0.f;
for (int i = 0; i < size_channel_in; ++i) {
sum += din_ch[i];
} }
res /= size_in_c; dout_batch[c] = sum / size_channel_in;
} }
dst_ptr[n * size_out_n + c] = res;
} }
} else {
LOG(FATAL) << "unsupported pooling type: " << pooling_type;
} }
} else { } else {
for (int n = 0; n < in_n; ++n) { for (int ind_n = 0; ind_n < num; ++ind_n) {
for (int c = 0; c < in_c; ++c) { #pragma omp parallel for
for (int h = 0; h < out_h; ++h) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
int sh = h * stride_h; for (int ind_h = 0; ind_h < hout; ++ind_h) {
int eh = sh + window_h; int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; eh = (eh - pad_h) > hin ? hin : eh - pad_h;
for (int w = 0; w < out_w; ++w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = w * stride_w; int sw = ind_w * stride_w;
int ew = sw + window_w; int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; ew = (ew - pad_w) > win ? win : ew - pad_w;
int pooling_size = (ew - sw) * (eh - sh); float result = static_cast<float>(0);
if (pooling_size == 0) continue; int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
float res = 0.f; ind_h * wout + ind_w;
for (int kh = sh; kh < eh; ++kh) { for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) { for (int kw = sw; kw < ew; ++kw) {
int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; int src_ind =
(ind_n * chin + ind_c) * size_channel_in + kh * win + kw;
if (kh == sh && kw == sw) { if (kh == sh && kw == sw) {
res = src_ptr[src_idx]; result = din[src_ind];
} else { } else {
if (pooling_type == "max") { if (pooling_type == "max") {
res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx]; result = result >= din[src_ind] ? result : din[src_ind];
} } else if (pooling_type == "avg") {
if (pooling_type == "avg") { result += din[src_ind];
res += src_ptr[src_idx];
} }
} }
} }
} }
if (pooling_type == "avg") { if (pooling_type == "avg") {
if (exclusive) { if (exclusive) {
res /= pooling_size; int div = (ew - sw) * (eh - sh);
div = div > 0 ? div : 1;
result /= div;
} else { } else {
res /= window_h * window_w; int bh = kernel_h;
int bw = kernel_w;
if (ew == win) {
bw = sw + kernel_w >= win + pad_w ? win + pad_w
: sw + kernel_w;
bw -= sw;
if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
bw += pad_w;
}
}
if (eh == hin) {
bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
: sh + kernel_h;
bh -= sh;
if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
bh += pad_h;
}
}
result /= bh * bw;
} }
} }
dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; dout[dst_ind] = result;
} }
} }
} }
......
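The exclusive/inclusive branch in the rewritten pool_compute_ref is the subtle part of this hunk. The helper below is a minimal sketch (a hypothetical stand-alone function, not part of the diff) that reproduces the divisor choice for a single already-clipped window, followed by one worked case.

#include <algorithm>

// sh/eh/sw/ew are the window bounds after the same clipping as above
// (shifted by the padding and clamped to [0, hin) x [0, win)).
int avg_pool_divisor(int sh, int eh, int sw, int ew,
                     int kernel_h, int kernel_w,
                     int hin, int win, int pad_h, int pad_w,
                     bool exclusive) {
  if (exclusive) {
    int div = (ew - sw) * (eh - sh);  // count of valid input elements
    return div > 0 ? div : 1;
  }
  int bh = kernel_h, bw = kernel_w;   // default: full kernel area
  if (ew == win) {                    // window clipped at the right edge
    bw = std::min(sw + kernel_w, win + pad_w) - sw;
    if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) bw += pad_w;
  }
  if (eh == hin) {                    // window clipped at the bottom edge
    bh = std::min(sh + kernel_h, hin + pad_h) - sh;
    if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) bh += pad_h;
  }
  return bh * bw;
}
// Worked case (assumed values): 4x4 input, 3x3 kernel, stride 1, pad 1, at
// output (0, 0) the clipped window is rows [0, 2) x cols [0, 2), so
// exclusive divides by 4 while inclusive divides by the kernel area 9.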
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/arm/slice_compute.h" #include "lite/kernels/arm/slice_compute.h"
#include <algorithm>
#include <vector> #include <vector>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -20,22 +21,145 @@ namespace lite { ...@@ -20,22 +21,145 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
inline std::vector<int32_t> get_new_data_from_tensorlist(
const std::vector<lite::Tensor*>& list_new_data_tensor) {
// get tensor
std::vector<int32_t> vec_new_data;
for (size_t i = 0; i < list_new_data_tensor.size(); ++i) {
auto tensor = list_new_data_tensor[i];
CHECK_EQ(tensor->dims(), DDim({1})) << "shape of dim tensor should be [1]";
vec_new_data.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
}
return vec_new_data;
}
inline std::vector<int32_t> get_new_data_from_tensor(
const lite::Tensor* new_data_tensor) {
std::vector<int32_t> vec_new_data;
auto* new_data = new_data_tensor->data<int32_t>();
vec_new_data =
std::vector<int32_t>(new_data, new_data + new_data_tensor->numel());
return vec_new_data;
}
void SliceCompute::PrepareForRun() {} void SliceCompute::PrepareForRun() {}
void SliceCompute::Run() { void SliceCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::SliceParam>(); auto& param = this->Param<operators::SliceParam>();
auto input_dims = param.X->dims(); auto in = param.X;
int dim_size = param.X->dims().size(); auto in_dims = in->dims();
auto out = param.Out;
auto out_dims = out->dims();
std::vector<int> starts = param.starts;
std::vector<int> ends = param.ends;
std::vector<int> axes = param.axes; std::vector<int> axes = param.axes;
const auto* x_data = param.X->data<int>(); std::vector<int32_t> starts = param.starts;
auto* o_data = param.Out->mutable_data<int>(); std::vector<int32_t> ends = param.ends;
std::vector<int> decrease_axis = param.decrease_axis;
std::vector<int> infer_flags = param.infer_flags;
auto list_new_ends_tensor = param.EndsTensorList;
auto list_new_starts_tensor = param.StartsTensorList;
bool need_infer = false;
if (param.StartsTensor || param.EndsTensor) {
need_infer = true;
}
if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) {
need_infer = true;
}
if (need_infer) {
if (param.StartsTensor) {
starts = get_new_data_from_tensor(param.StartsTensor);
} else if (list_new_starts_tensor.size() > 0) {
starts = get_new_data_from_tensorlist(list_new_starts_tensor);
}
CHECK_EQ(starts.size(), axes.size())
<< "The size of starts must be equal to the size of axes.";
if (param.EndsTensor) {
ends = get_new_data_from_tensor(param.EndsTensor);
} else if (list_new_ends_tensor.size() > 0) {
ends = get_new_data_from_tensorlist(list_new_ends_tensor);
}
CHECK_EQ(ends.size(), axes.size())
<< "The size of ends must be equal to the size of axes.";
out_dims = in_dims;
int dim_value, start, end;
for (size_t i = 0; i < axes.size(); ++i) {
dim_value = out_dims[axes[i]];
if (dim_value > 0) {
// when end = start+1 and start == -1
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = 10000000;
}
}
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0);
end = std::max(end, 0);
end = std::min(end, dim_value);
CHECK_GT(end, start) << "end should be greater than start";
out_dims[axes[i]] = end - start;
}
}
out->Resize(out_dims);
// generate new shape
if (decrease_axis.size() > 0) {
std::vector<int64_t> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1";
out_dims[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims.size(); ++i) {
if (out_dims[i] != 0) {
new_out_shape.push_back(out_dims[i]);
}
}
if (new_out_shape.size() == 0) {
new_out_shape.push_back(1);
}
DDim new_dims;
new_dims.ConstructFrom(new_out_shape);
out_dims = new_dims;
}
}
// resize out dims
if (decrease_axis.size() > 0) {
if (decrease_axis.size() == (size_t)in_dims.size()) {
std::vector<int64_t> vec_origin_out_shape(decrease_axis.size(), 1);
out->Resize(DDim(vec_origin_out_shape));
} else {
std::vector<int64_t> vec_origin_out_shape(
out_dims.size() + decrease_axis.size(), -1);
for (size_t i = 0; i < decrease_axis.size(); ++i) {
vec_origin_out_shape[decrease_axis[i]] = 1;
}
int index = 0;
for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) {
if (vec_origin_out_shape[i] == -1) {
vec_origin_out_shape[i] = out_dims[index];
++index;
}
}
out->Resize(DDim(vec_origin_out_shape));
}
}
auto new_out_dims = out->dims();
const auto* x_data = in->data<int>();
auto* o_data = out->mutable_data<int>();
lite::arm::math::slice( lite::arm::math::slice(
x_data, input_dims.data(), axes, starts, ends, o_data, &ctx); x_data, in_dims.data(), axes, starts, ends, o_data, &ctx);
} }
} // namespace arm } // namespace arm
...@@ -46,12 +170,9 @@ void SliceCompute::Run() { ...@@ -46,12 +170,9 @@ void SliceCompute::Run() {
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def) slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
// REGISTER_LITE_KERNEL(
// slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), Precision(kINT32))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM),
// Precision(kINT32))})
// .Finalize();
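To make the start/end normalization in SliceCompute::Run() concrete, here is a small self-contained example (shapes and indices are illustrative assumptions): negative starts/ends wrap around the dimension, and ends are clamped to the dimension before the output extent is computed.

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> in_dims = {3, 4, 5};
  std::vector<int> axes = {1}, starts = {-3}, ends = {100};
  std::vector<int64_t> out_dims = in_dims;
  for (size_t i = 0; i < axes.size(); ++i) {
    int dim_value = static_cast<int>(in_dims[axes[i]]);            // 4
    int start = starts[i] < 0 ? starts[i] + dim_value : starts[i];  // -3 + 4 = 1
    int end = ends[i] < 0 ? ends[i] + dim_value : ends[i];          // 100
    start = std::max(start, 0);
    end = std::min(std::max(end, 0), dim_value);                    // clamp to 4
    out_dims[axes[i]] = end - start;                                // 3
  }
  // out_dims is now {3, 3, 5}
  return 0;
}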
...@@ -55,6 +55,10 @@ REGISTER_LITE_KERNEL(unsqueeze, ...@@ -55,6 +55,10 @@ REGISTER_LITE_KERNEL(unsqueeze,
paddle::lite::kernels::host::UnsqueezeCompute, paddle::lite::kernels::host::UnsqueezeCompute,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -65,6 +69,10 @@ REGISTER_LITE_KERNEL(unsqueeze2, ...@@ -65,6 +69,10 @@ REGISTER_LITE_KERNEL(unsqueeze2,
paddle::lite::kernels::host::Unsqueeze2Compute, paddle::lite::kernels::host::Unsqueeze2Compute,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -9,6 +9,7 @@ add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_k ...@@ -9,6 +9,7 @@ add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_k
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose)
add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda})
...@@ -22,6 +23,16 @@ add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_k ...@@ -22,6 +23,16 @@ add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_k
add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps})
add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_seq_depadding_compute_cuda CUDA basic SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps})
add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps})
add_kernel(match_matrix_tensor_compute_cuda CUDA basic SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm)
add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm)
add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm)
add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda)
nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
...@@ -32,8 +43,22 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c ...@@ -32,8 +43,22 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c
nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda)
nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda)
nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda)
nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
#nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda)
nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda)
nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda)
nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda)
nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
if(LITE_BUILD_EXTRA)
nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda)
nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda)
nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/attention_padding_mask_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
#define CUDA_NUM_THREADS 256
inline int CUDA_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void ker_attention_padding_mask(T* out_data,
const T* attn_data,
const int* src_offset,
const int attn_seq_num,
const int attn_seq_len,
const int src_seq_num,
const int src_seq_len,
const T mask,
const int count) {
CUDA_KERNEL_LOOP(tid, count) {
int src_word_id = tid % src_seq_len;
int tmp_tid = tid / src_seq_len;
int attn_seq_id = tmp_tid / attn_seq_len;
int attn_word_id = tmp_tid % attn_seq_len;
int src_seq_id = attn_seq_id % src_seq_num;
int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id];
if (src_word_id >= cur_len) {
out_data[tid] = mask;
} else {
out_data[tid] = attn_data[tid];
}
}
}
void AttentionPaddingMaskCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto attn = param.X;
auto src = param.Y;
const int count = attn->numel();
auto attn_offset = attn->lod()[0];
auto src_offset = src->lod()[0];
const int attn_seq_num = attn_offset.size() - 1;
const int attn_seq_len = attn_offset[1];
const int src_seq_num = src_offset.size() - 1;
const int src_seq_len = count / attn->dims()[0];
auto out = param.Out;
out->Resize(attn->dims());
out->set_lod(attn->lod());
auto attn_data = attn->data<float>();
auto out_data = out->mutable_data<float>(TARGET(kCUDA));
std::vector<int> src_offset_cpu(src_offset.size(), 0);
for (int i = 0; i < src_offset.size(); i++) {
src_offset_cpu[i] = src_offset[i];
}
src_offset_cuda.Resize({static_cast<int64_t>(src_offset.size())});
auto src_offset_cuda_data = src_offset_cuda.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data,
src_offset_cpu.data(),
sizeof(int) * src_offset.size(),
IoDirection::HtoD,
stream);
ker_attention_padding_mask<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
out_data,
attn_data,
src_offset_cuda_data,
attn_seq_num,
attn_seq_len,
src_seq_num,
src_seq_len,
param.mask,
count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(attention_padding_mask,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::AttentionPaddingMaskCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AttentionPaddingMaskCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::AttentionPaddingMaskParam;
void Run() override;
virtual ~AttentionPaddingMaskCompute() = default;
private:
lite::Tensor src_offset_cuda;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/attention_padding_mask_compute.h"
#include <gtest/gtest.h>
#include <cstring>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
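// CPU reference: copy the attention matrix, then overwrite every column that
// lies beyond the corresponding source sequence's length with param.mask.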
void attention_padding_mask_ref(
const Tensor& x,
const Tensor& y,
Tensor* out,
Tensor* pad_begin,
const operators::AttentionPaddingMaskParam& param) {
auto attn_offset = x.lod()[0];
auto src_offset = y.lod()[0];
int attn_seq_num = attn_offset.size() - 1;
int src_seq_num = src_offset.size() - 1;
int attn_seq_len = attn_offset[1];
int src_seq_len = x.dims()[1];
CHECK_EQ(attn_seq_num % src_seq_num, 0);
auto count = x.numel();
auto attn_data = x.data<float>();
out->Resize(x.dims());
out->set_lod(x.lod());
auto out_data = out->mutable_data<float>();
memcpy(out_data, attn_data, count * sizeof(float));
for (int i = 0; i < attn_seq_num; ++i) {
for (int j = 0; j < attn_seq_len; ++j) {
auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j);
int src_seq_idx = i % src_seq_num;
int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx];
for (int k = cur_len; k < src_seq_len; k++) {
tmp_out_data[k] = param.mask;
}
}
}
}
void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) {
std::vector<int64_t> x_dims{static_cast<int64_t>(lod[0].back()), dim2rd};
x->Resize(x_dims);
x->set_lod(lod);
auto x_data = x->mutable_data<float>();
auto x_num = x->numel();
for (int i = 0; i < x_num; i++) {
x_data[i] = (i - x_num) * 1.1;
}
}
int get_max_len(const LoD& lod) {
int max_len = 0;
auto offset = lod[0];
for (int i = 0; i < offset.size() - 1; i++) {
int cur_len = offset[i + 1] - offset[i];
max_len = max_len < cur_len ? cur_len : max_len;
}
return max_len;
}
TEST(attention_padding_mask_cuda, run_test) {
lite::Tensor x, y, x_cpu, y_cpu;
lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref;
LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}};
prepare_input(&x_cpu, x_lod, get_max_len(y_lod));
prepare_input(&y_cpu, y_lod, 1);
x.Resize(x_cpu.dims());
x.set_lod(x_cpu.lod());
auto x_cpu_data = x_cpu.mutable_data<float>();
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
y.Resize(y_cpu.dims());
y.set_lod(y_cpu.lod());
operators::AttentionPaddingMaskParam param;
param.X = &x;
param.Y = &y;
param.pad_id = 12800001;
param.mask = -90000000.f;
param.Out = &out;
param.pad_begin = &pad_begin;
std::unique_ptr<KernelContext> ctx(new KernelContext);
  auto& context = ctx->As<CUDAContext>();
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
AttentionPaddingMaskCompute attention_padding_mask_kernel;
attention_padding_mask_kernel.SetParam(param);
attention_padding_mask_kernel.SetContext(std::move(ctx));
attention_padding_mask_kernel.Run();
cudaDeviceSynchronize();
auto out_data = out.mutable_data<float>(TARGET(kCUDA));
out_cpu.Resize(out.dims());
auto out_cpu_data = out_cpu.mutable_data<float>();
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param);
auto out_ref_data = out_ref.data<float>();
for (int i = 0; i < out.numel(); i++) {
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/cuda/bilinear_interp_compute.h" #include "lite/kernels/cuda/bilinear_interp_compute.h"
...@@ -20,6 +21,43 @@ namespace kernels { ...@@ -20,6 +21,43 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
inline std::vector<int> get_new_shape(
std::vector<const lite::Tensor*> list_new_shape_tensor) {
  // Copy each one-element shape tensor from device to host and collect its value.
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
lite::Tensor temp;
auto temp_data = temp.mutable_data<float>();
auto tensor_data = tensor->data<float>();
cudaMemcpy(temp_data,
tensor_data,
tensor->dims().production() * sizeof(float),
cudaMemcpyDeviceToHost);
vec_new_shape.push_back(static_cast<int32_t>(*temp_data));
}
return vec_new_shape;
}
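// Copies a small device-resident tensor to the host and returns its contents
// as a std::vector<T>; used below to read the Scale tensor on the CPU.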
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
lite::Tensor cpu_starts_tensor;
auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data<T>();
cudaMemcpy(cpu_starts_tensor_data,
new_data,
new_data_tensor->dims().production() * sizeof(T),
cudaMemcpyDeviceToHost);
auto new_data_ = cpu_starts_tensor.data<T>();
vec_new_data = std::vector<T>(
new_data_, new_data_ + new_data_tensor->dims().production());
return vec_new_data;
}
template <typename T> template <typename T>
__global__ void BilinearInterp(const T* in, __global__ void BilinearInterp(const T* in,
const size_t in_img_h, const size_t in_img_h,
...@@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { ...@@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() {
int out_w = param.out_w; int out_w = param.out_w;
float scale = param.scale; float scale = param.scale;
bool align_corners = param.align_corners; bool align_corners = param.align_corners;
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
if (out_size != nullptr) { auto list_new_shape_tensor = param.SizeTensor;
Tensor sizes; if (list_new_shape_tensor.size() > 0) {
float* size_data = sizes.mutable_data<float>(); // have size tensor
float* outsize_data = out_size->mutable_data<float>(TARGET(kCUDA)); auto new_size = get_new_shape(list_new_shape_tensor);
cudaMemcpy( out_h = new_size[0];
size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); out_w = new_size[1];
out_h = static_cast<int>(size_data[0]); } else {
out_w = static_cast<int>(size_data[1]); auto scale_tensor = param.Scale;
if (scale_tensor != nullptr) {
auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
scale = scale_data[0];
}
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
if (out_size != nullptr) {
lite::Tensor sizes;
float* size_data = sizes.mutable_data<float>();
float* outsize_data = out_size->mutable_data<float>(TARGET(kCUDA));
cudaMemcpy(
size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost);
out_h = static_cast<int>(size_data[0]);
out_w = static_cast<int>(size_data[1]);
}
} }
auto output_data = output->mutable_data<float>(TARGET(kCUDA)); auto output_data = output->mutable_data<float>(TARGET(kCUDA));
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
cudaMemcpy(output_data, cudaMemcpy(output_data,
input_data, input_data,
...@@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, ...@@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp,
{LiteType::GetTensorTy(TARGET(kCUDA), {LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Scale",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), {LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include <vector>
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { ...@@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) {
} }
} }
TEST(bilinear_interp, update) {
BilinearInterpCompute bilinear_interp_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::InterpolateParam param;
std::vector<Tensor> size_tensor(2);
std::vector<Tensor> size_tensor_cpu(2), size_tensor_ref(2);
Tensor x, input_scale, osz, out;
Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu;
Tensor x_ref, input_scale_ref, osz_ref, out_ref;
int n = 1, c = 1, in_h = 3, in_w = 3;
int out_h = 6, out_w = 6;
float scale = 2.0;
param.out_h = out_h;
param.out_w = out_w;
param.scale = scale;
param.align_corners = false;
param.align_mode = 0;
x.Resize({n, c, in_h, in_w});
size_tensor[0].Resize({1});
size_tensor[1].Resize({1});
input_scale.Resize({1});
osz.Resize({2});
out.Resize({n, c, out_h, out_w});
x_cpu.Resize({n, c, in_h, in_w});
size_tensor_cpu[0].Resize({1});
size_tensor_cpu[1].Resize({1});
input_scale_cpu.Resize({1});
osz_cpu.Resize({2});
out_cpu.Resize({n, c, out_h, out_w});
x_ref.Resize({n, c, in_h, in_w});
size_tensor_ref[0].Resize({1});
size_tensor_ref[1].Resize({1});
input_scale_ref.Resize({1});
osz_ref.Resize({2});
out_ref.Resize({n, c, out_h, out_w});
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data<float>();
float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data<float>();
float* input_scale_cpu_data = input_scale_cpu.mutable_data<float>();
float* osz_cpu_data = osz_cpu.mutable_data<float>();
float* out_cpu_data = out_cpu.mutable_data<float>();
float* x_ref_data = x_ref.mutable_data<float>();
float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data<float>();
float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data<float>();
float* input_scale_ref_data = input_scale_ref.mutable_data<float>();
float* osz_ref_data = osz_ref.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); ++i) {
x_cpu_data[i] = i + 5.0;
x_ref_data[i] = i + 5.0;
}
osz_cpu_data[0] = out_h;
osz_cpu_data[1] = out_w;
size_tensor0_cpu_data[0] = out_h;
size_tensor1_cpu_data[0] = out_w;
input_scale_cpu_data[0] = scale;
osz_ref_data[0] = out_h;
osz_ref_data[1] = out_w;
size_tensor0_ref_data[0] = out_h;
size_tensor1_ref_data[0] = out_w;
input_scale_ref_data[0] = scale;
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
size_tensor[0].Assign<float, lite::DDim, TARGET(kCUDA)>(
size_tensor0_cpu_data, size_tensor[0].dims());
size_tensor[1].Assign<float, lite::DDim, TARGET(kCUDA)>(
size_tensor1_cpu_data, size_tensor[1].dims());
input_scale.Assign<float, lite::DDim, TARGET(kCUDA)>(input_scale_cpu_data,
input_scale.dims());
osz.Assign<float, lite::DDim, TARGET(kCUDA)>(osz_cpu_data, osz_cpu.dims());
param.X = &x;
param.SizeTensor.emplace_back(
reinterpret_cast<const Tensor*>(&size_tensor[0]));
param.SizeTensor.emplace_back(
reinterpret_cast<const Tensor*>(&size_tensor[1]));
param.Scale = &input_scale;
param.OutSize = &osz;
param.Out = &out;
bilinear_interp_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
bilinear_interp_kernel.SetContext(std::move(ctx));
bilinear_interp_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
for (int i = 0; i < out.numel(); i++) {
LOG(INFO) << out_cpu_data[i];
}
}
} // namespace cuda } // namespace cuda
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -51,6 +51,11 @@ void ConcatCompute<Dtype>::Run() { ...@@ -51,6 +51,11 @@ void ConcatCompute<Dtype>::Run() {
Tensor* output = param.output; Tensor* output = param.output;
auto* output_data = output->mutable_data<Dtype>(TARGET(kCUDA)); auto* output_data = output->mutable_data<Dtype>(TARGET(kCUDA));
int axis = param.axis; int axis = param.axis;
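  // If AxisTensor is supplied, its first element overrides the axis attribute.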
Tensor* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
const int* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
int inner_size = 1; int inner_size = 1;
int outer_size = 1; int outer_size = 1;
auto input_dims = input[0]->dims(); auto input_dims = input[0]->dims();
...@@ -97,5 +102,7 @@ REGISTER_LITE_KERNEL(concat, ...@@ -97,5 +102,7 @@ REGISTER_LITE_KERNEL(concat,
paddle::lite::kernels::cuda::ConcatCompute<float>, paddle::lite::kernels::cuda::ConcatCompute<float>,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize(); .Finalize();
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/lookup_table_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
using Tensor = lite::Tensor;
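// Looks up one embedding row per id: ids are distributed over (threadIdx.y,
// blockIdx.x) and the D columns of each selected row are strided over
// threadIdx.x. When PaddingFlag is set, rows whose id equals padding_idx are
// written as zeros instead of being read from the table.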
template <int BlockDimX, int BlockDimY, int GridDimX, bool PaddingFlag>
__global__ void LookupTableKernel(float *output,
const float *table,
const int64_t *ids,
const int64_t N,
const int64_t K,
const int64_t D,
const int64_t padding_idx) {
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * GridDimX;
while (idy < K) {
int64_t id = ids[idy];
float *out = output + idy * D;
const float *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
if (PaddingFlag) {
if (id == padding_idx)
out[i] = static_cast<float>(0);
else
out[i] = tab[i];
} else {
out[i] = tab[i];
}
}
idy += BlockDimY * GridDimX;
}
}
void LookupTableCompute::Run() {
auto &param = this->Param<param_t>();
auto &ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
Tensor *w_t = param.W;
Tensor *ids_t = param.Ids;
Tensor *out_t = param.Out;
int64_t padding_idx = param.padding_idx;
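  // N: vocabulary size (rows of W), D: embedding width, K: number of ids.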
size_t N = w_t->dims()[0];
size_t D = w_t->dims()[1];
size_t K = ids_t->numel();
auto *w = w_t->data<float>();
auto *ids = ids_t->data<int64_t>();
auto *out = out_t->mutable_data<float>(TARGET(kCUDA));
dim3 threads(128, 8);
dim3 grids(8, 1);
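  // padding_idx == -1 means there is no padding row; otherwise the templated
  // flag makes the kernel zero-fill rows whose id equals padding_idx.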
if (padding_idx == -1) {
LookupTableKernel<128, 8, 8, false><<<grids, threads, 0, stream>>>(
out, w, ids, N, K, D, padding_idx);
} else {
LookupTableKernel<128, 8, 8, true><<<grids, threads, 0, stream>>>(
out, w, ids, N, K, D, padding_idx);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(lookup_table,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.Finalize();