diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9ff908a4c87d55e87468a06ae0e6085ac165a1b1..a5d3d572181bcd7555d112961eac497e1195cfe3 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -174,15 +174,26 @@ if(NOT WITH_DSO) endif(WIN32) endif(NOT WITH_DSO) -get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY) -function(import_static_library alias path) +function(add_cuda_static_lib alias cuda_lib_paths file_name) + unset(ABS_PATH CACHE) + find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH) add_library(${alias} STATIC IMPORTED GLOBAL) - set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) + set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH}) + set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE) + if (NOT ABS_PATH) + message(FATAL_ERROR "Can not find CUDA static library: ${file_name}") + endif() endfunction() -import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a) -import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a) -import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a) -import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a) + +add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a) +add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a) +add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a) +add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a) +if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a) +endif() + +set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES}) # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 842b94d47e75b4bab577a1150cb3d198eb42ebaf..574baa86a82963ffa76795e029a6ba14f537c80a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/lib/${TARGET_ARCH}-linux-gnu/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index bd0d117a633824d93c403b8167ff49505160069b..599e7bba7eaf12da7506ce44e706bd9f50ec6998 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -16,9 +17,12 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME 
"hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -29,12 +33,14 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" + DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 1d61154c0d45dea795902d6544deb796693db263..5166b494c489e25c970c7dbfe72fa1404302009f 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -20,6 +20,7 @@ endif() include(ExternalProject) +SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) @@ -38,8 +39,11 @@ ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" - GIT_REPOSITORY "https://github.com/herumi/xbyak.git" GIT_TAG "v5.661" # Jul 26th + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip + DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR} + DOWNLOAD_NAME "xbyak-5.66.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XBYAK_PREFIX_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 23b1e02108642df561948a6faa3152effb7ca932..fdc20351e8bcdf5fe8e95db3516f4c6f607611db 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") @@ -18,10 +19,12 @@ if(WIN32) ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NAME "xxHash-0.6.5.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 PATCH_COMMAND @@ -41,10 +44,12 @@ else() ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" + DOWNLOAD_NAME "xxHash-0.6.5.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 415eb451a986cd7e59829b9a8f2c744ecf464bd6..225a3c19a16435c4df6403ff7d1bdd01e628dd72 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) 
cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) @@ -507,7 +510,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest -gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) + gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 4423e27e1af4d7bf0f0cc9e60858b8144fc3648d..3b9b4ece23266ce818e02c50ac2cd53c8771762a 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -164,7 +164,9 @@ function(lite_cc_library TARGET) endfunction() function(lite_cc_binary TARGET) - set(options "") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(options " -g ") + endif() set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) @@ -255,6 +257,7 @@ endfunction() set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") +set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 173f04126eccbe0ec324c6e19ea8f21f278fd539..036df2a824c3b696c892cb7462f9afb4a3e2a10a 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") @@ -121,6 +122,9 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) endif() +if(LITE_WITH_CUDA) + add_dependencies(publish_inference paddle_full_api_shared) +endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -177,6 +181,8 @@ if 
(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference tiny_publish_cxx_lib) + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() endif() @@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) endif() endif() - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + if ((ARM_TARGET_OS STREQUAL "android") AND ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) if (NOT LITE_ON_TINY_PUBLISH) # copy @@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index aef0fc396e6eb35a7ef85a8f2fc13651237e19a3..e660bbcdd606133db4e7891b6973f26983b4dd79 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library add_library(paddle_full_api_shared SHARED "") target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) @@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and 
add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) endif() - + if(LITE_WITH_CUDA) + target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") + endif(LITE_WITH_CUDA) #light api dynamic library lite_cc_library(paddle_light_api_shared MODULE SRCS light_api_shared.cc @@ -65,6 +67,7 @@ endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") +message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") @@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) endif() # for light api set(light_api_deps scope target_wrapper_host model_parser program) if(LITE_WITH_CUDA) + get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) set(light_api_deps ${light_api_deps} target_wrapper_cuda) - set(cuda_static_deps cudart_static cublas_static curand_static - cudnn_static culibos_static) endif() lite_cc_library(light_api SRCS light_api.cc DEPS scope target_wrapper_host model_parser @@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -305,9 +307,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - X86_DEPS ${x86_kernels}) + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -316,7 +319,9 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 462a5e2381acf3cc86ca81002a282933f01ee049..c137324b576f9f9399669a5e68d948b9921e4866 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector<std::vector<int64_t>>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}}; + std::vector<Place> vaild_places = { + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}, + }; if (FLAGS_is_quantized_model) { vaild_places.insert(vaild_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index a2b538aa77e0603f439b6b23aab875103fdbbff0..4647f20bbe476d8763f94f707f3d88da7c7544df 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -24,13 +24,6 @@ namespace paddle { namespace lite { -static 
const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = - ".tailored_ops_source_list"; -static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; -static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = - ".tailored_kernels_source_list"; -static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; - void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) { // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } + // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { + std::vector feeds; + std::vector fetchs; +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + // The shape of input tensors must be determined before generating NPU and XPU + // program. auto current_block = program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); +#else + if (!program_) { + GenRuntimeProgram(); + } + const auto &insts = program_->instructions(); + for (size_t i = 0; i < program_->num_instructions(); i++) { + const auto &op = insts[i].op()->op_info(); +#endif if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { fetchs.push_back(op); } } + input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); for (size_t i = 0; i < feeds.size(); i++) { @@ -190,6 +197,7 @@ std::vector Predictor::GetOutputs() const { const cpp::ProgramDesc &Predictor::program_desc() const { return program_desc_; } + const RuntimeProgram &Predictor::runtime_program() const { return *program_; } void Predictor::Build(const lite_api::CxxConfig &config, @@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc, const std::vector &valid_places, const std::vector &passes) { program_desc_ = desc; + // `inner_places` is used to optimize passes std::vector inner_places = valid_places; inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Program program(desc, scope_, inner_places); - /// The first place in valid_places is + core::KernelPickFactor factor; factor.ConsiderTarget(); factor.ConsiderPrecision(); factor.ConsiderDataLayout(); + optimizer_.Run(std::move(program), inner_places, factor, passes); exec_scope_ = optimizer_.exec_scope(); PrepareFeedFetch(); @@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); } + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 502ce812e1f4a7f520e89e6eaff020c5853f5308..504710d9fa29420b8762f31e0c675b59c6c626bd 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -29,6 +29,13 @@ namespace paddle { namespace lite { +static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = + ".tailored_ops_source_list"; +static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; +static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = + ".tailored_kernels_source_list"; +static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; + /* * Predictor for inference, 
input a model, it will optimize and execute it. */ diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 63a401745b325654f81c3af93402703395264c0d..79f9bea762e099b249f597dddb7df790361edc2a 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index 1aef522b2a6bb95f895449469f3c13e4a713179a..1c426e8568cf71b6f48edbbeb8a93fec2e89c594 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -20,6 +20,7 @@ // model_optimize_tool's compiling period #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" @@ -31,6 +32,18 @@ DEFINE_string(model_dir, "", "path of the model. This option will be ignored if model_file " "and param_file are exist"); +DEFINE_string(model_filename, + "", + "model topo filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(param_filename, + "", + "model param filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(model_set_dir, + "", + "path of the models set. This option will be used to specific" + " tailoring"); DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( @@ -58,29 +71,23 @@ void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); } -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - +std::vector ParserValidPlaces() { std::vector valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); + auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { valid_places.emplace_back(TARGET(kX86)); } else { @@ -100,26 +107,130 @@ void Main() { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } + return valid_places; +} + +void RunOptimize(const std::string& model_dir, + const std::string& model_file, + const std::string& param_file, + const std::string& optimize_out, + const std::string& optimize_out_type, + const std::vector& valid_places, + bool record_tailoring_info) { + if (!model_file.empty() && !param_file.empty()) { + LOG(WARNING) + << "Load combined-param model. Option model_dir will be ignored"; + } + + lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_model_file(model_file); + config.set_param_file(param_file); + config.set_valid_places(valid_places); auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { + if (optimize_out_type == "protobuf") { model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { + } else if (optimize_out_type == "naive_buffer") { model_type = LiteModelType::kNaiveBuffer; } else { - LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type; + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; } - OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); predictor->SaveOptimizedModel( - FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info); - if (FLAGS_record_tailoring_info) { + optimize_out, model_type, record_tailoring_info); + if (record_tailoring_info) { LOG(INFO) << "Record the information of tailored model into :" - << FLAGS_optimize_out; + << optimize_out; + } +} + +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void Main() { + if (FLAGS_display_kernels) { + DisplayKernels(); + exit(0); } + + auto valid_places = ParserValidPlaces(); + if (FLAGS_model_set_dir == "") { + 
RunOptimize(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_param_file, + FLAGS_optimize_out, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + return; + } + + if (!FLAGS_record_tailoring_info) { + LOG(WARNING) << "--model_set_dir option only be used with " + "--record_tailoring_info=true together"; + return; + } + + auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; + } + // Optimize models in FLAGS_model_set_dir + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({FLAGS_model_set_dir, name}, "/"); + std::string output_model_dir = + lite::Join({FLAGS_optimize_out, name}, "/"); + + std::string model_file = ""; + std::string param_file = ""; + + if (FLAGS_model_filename != "" && FLAGS_param_filename != "") { + model_file = + lite::Join({input_model_dir, FLAGS_model_filename}, "/"); + param_file = + lite::Join({input_model_dir, FLAGS_param_filename}, "/"); + } + + LOG(INFO) << "Start optimize model: " << input_model_dir; + RunOptimize(input_model_dir, + model_file, + param_file, + output_model_dir, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + LOG(INFO) << "Optimize done. "; + } + + // Collect all models information + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(FLAGS_optimize_out, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); } } // namespace lite_api diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 1358267000991c81b80453669cf46638449b8a7b..a04e86b7d2a1e06a52c38b5f00e9c07966be1bfe 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -21,14 +21,14 @@ #include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" -#include "lite/tests/utils/timer.h" +#include "lite/core/profile/timer.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/basic_profiler.h" #endif // LITE_WITH_PROFILE -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_string(input_shape, "1,3,224,224", @@ -102,20 +102,20 @@ void Run(const std::vector>& input_shapes, Timer ti; for (int j = 0; j < repeat; ++j) { - ti.start(); + ti.Start(); predictor->Run(); - ti.end(); - LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms"; + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; } LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup_times - << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms() + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() << " ms" - << ", min time: " << ti.get_min_time() << " ms" - << ", max time: " << ti.get_max_time() << " ms."; + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; auto output = predictor->GetOutput(0); auto out = output->data(); diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index f148096bb69a3a249521bcb847d5beae3f8297f9..aabb53529221bde53b6b2ee27b2efefee2e6054d 
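Note on the model_test.cc hunk above: the benchmark now times with the Timer from lite/core/profile/timer.h (namespace paddle::lite::profile), calling Start()/Stop() per iteration and reporting LapTimes().Avg()/Min()/Max(). The standalone sketch below reproduces that lap-timing pattern with std::chrono only, so the reporting logic can be followed outside the Lite tree; the LapTimer class here is written for this example and is not the Lite API.

```cpp
// Standalone illustration of the lap-timer pattern used by the updated
// benchmark loop (Start/Stop per iteration, then Avg/Min/Max reporting).
#include <algorithm>
#include <chrono>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>

class LapTimer {
 public:
  void Start() { begin_ = std::chrono::steady_clock::now(); }
  // Ends the current lap, records it, and returns its duration in ms.
  float Stop() {
    auto end = std::chrono::steady_clock::now();
    float ms = std::chrono::duration<float, std::milli>(end - begin_).count();
    laps_.push_back(ms);
    return ms;
  }
  float Avg() const {
    return laps_.empty() ? 0.f
                         : std::accumulate(laps_.begin(), laps_.end(), 0.f) /
                               laps_.size();
  }
  float Min() const {
    return laps_.empty() ? 0.f : *std::min_element(laps_.begin(), laps_.end());
  }
  float Max() const {
    return laps_.empty() ? 0.f : *std::max_element(laps_.begin(), laps_.end());
  }

 private:
  std::chrono::steady_clock::time_point begin_;
  std::vector<float> laps_;
};

int main() {
  LapTimer ti;
  for (int j = 0; j < 10; ++j) {
    ti.Start();
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for predictor->Run()
    float t = ti.Stop();
    std::cout << "iter: " << j << ", time: " << t << " ms\n";
  }
  std::cout << "avg: " << ti.Avg() << ", min: " << ti.Min()
            << ", max: " << ti.Max() << " ms\n";
  return 0;
}
```

In the patched benchmark the equivalent calls are ti.Start(), ti.Stop(), and ti.LapTimes().Avg()/Min()/Max().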
100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) { } } template -void Tensor::CopyToCpu(T *data) { +void Tensor::CopyToCpu(T *data) const { const T *src_data = tensor(raw_tensor_)->data(); int64_t num = tensor(raw_tensor_)->numel(); CHECK(num > 0) << "You should call Resize interface first"; @@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *); -template void Tensor::CopyToCpu(float *); -template void Tensor::CopyToCpu(int *); +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(float *) const; +template void Tensor::CopyToCpu(int *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 42b455da811fe1a21277d38f2e1237000276b1ff..c578769bd5159d27ad43e4e93de33f601223004b 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -49,7 +49,7 @@ struct LITE_API Tensor { void CopyFromCpu(const T* data); template - void CopyToCpu(T* data); + void CopyToCpu(T* data) const; /// Shape of the tensor. shape_t shape() const; TargetType target() const; diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 3d7d496afbc55e1dfdfe83d123c7e41dd59bf1ff..894d839185ea9e1b6b47b87c398f249f044c2b51 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) { "any", "fpga", "npu", - "xpu", - "bm"}; + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) { "kAny", "kFPGA", "kNPU", - "kXPU", - "kBM"}; + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -131,8 +129,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), - TARGET(kFPGA), - TARGET(kBM)}); + TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index a13abb699cea36ba53e430668e8dcd6d19d46d9e..07284be095c05e5dfa069b0973d5982cf1f07c8a 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -52,9 +52,8 @@ enum class TargetType : int { kFPGA = 7, kNPU = 8, kXPU = 9, - kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + NUM = 10, // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 70355fdf890eb63cd5bedd5bab42a2dd69af0927..9d56d262abf549584819ab893144e41fc399439f 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass); USE_MIR_PASS(variable_place_inference_pass); USE_MIR_PASS(type_target_cast_pass); USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); +#ifdef LITE_WITH_NPU +USE_MIR_PASS(generate_npu_program_pass); +#endif +#ifdef LITE_WITH_XPU +USE_MIR_PASS(generate_xpu_program_pass); +#endif USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index c483373dc745f6520d51ece3936448ada71990d3..5314c5ed75d862635a1b87cdad33bf3c58dcd6cc 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -12,20 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include #include #include diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index cbbcf49a5fd55dabd6b072bc6b3b2e3f9bb91a13..076c791daab182c4eff477a621ecd2ec52a0c3e7 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -57,9 +57,10 @@ endif() if (NOT HAS_ARM_MATH_LIB_DIR) # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc + cc_library(math_arm SRCS + funcs.cc packed_sgemm.cc + packed_sgemm_c4.cc sgemm.cc gemm_prepacked_int8.cc gemm_s8.cc @@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc - conv3x3s1_depthwise_fp32.cc - conv3x3s2_depthwise_fp32.cc + conv3x3s1p01_depthwise_fp32.cc + conv3x3s2p01_depthwise_fp32.cc + conv3x3s1px_depthwise_fp32.cc + conv3x3s2px_depthwise_fp32.cc conv3x3s1_direct_int8.cc conv3x3s2_direct_int8.cc conv3x3s1_depthwise_int8.cc @@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s1_depthwise_int8.cc conv5x5s1_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_3x3s1.cc - conv_depthwise_3x3s2.cc + conv3x3_winograd_fp32_c4.cc conv_winograd_3x3.cc conv_impl.cc - softmax.cc + softmax.cc scale.cc pooling.cc elementwise.cc + layout.cc lrn.cc decode_bboxes.cc concat.cc @@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc DEPS ${lite_kernel_deps} context tensor) endif() - diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc index b5d2c6af13cc1dd864eaac6cb6589cc879f029fe..38be1d689dd47ab59baf417e40989a91bb6366e0 100644 --- a/lite/backends/arm/math/col_im_transform.cc +++ b/lite/backends/arm/math/col_im_transform.cc @@ -32,8 +32,10 @@ void 
col2im(const float* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -41,19 +43,22 @@ void col2im(const float* data_col, float* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h index 8560679d7f4091c4cb424b54e54a42cf6e7e8905..e3e32c4715ade10972f77e0c4d5a2cd4d16b4725 100644 --- a/lite/backends/arm/math/col_im_transform.h +++ b/lite/backends/arm/math/col_im_transform.h @@ -26,8 +26,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc new file mode 100644 index 0000000000000000000000000000000000000000..5834461b8fe0b2d37f174d5f66269fb58f2504a1 --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
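The new file conv3x3_winograd_fp32_c4.cc, which continues below, implements a Winograd F(6x6, 3x3) convolution on channel-blocked (c4) data: each 8x8 input tile is transformed (input_trans_c4), the 64 per-position products are evaluated as small GEMMs (the gi loop over 64 calling sgemm_prepack_c4_small), and a 6x6 output tile is recovered (output_trans_c4 / output_trans_c4_post), which is why tiles are counted as (wout + 5) / 6. For reference, the underlying transform, assuming the standard Winograd formulation (the fixed matrices B, G, A are the usual F(6,3) ones and are not spelled out in the patch):

```latex
% Winograd F(6x6, 3x3): d is an 8x8 input tile, g a 3x3 kernel, Y the 6x6 output tile.
% B^T (8x8), G (8x3) and A^T (6x8) are fixed transform matrices; \odot is the
% element-wise (Hadamard) product. Batched over channels and tiles, the 8x8 = 64
% pointwise products become the 64 small GEMMs in the gi loop.
Y = A^{\top}\left[\,\bigl(G\,g\,G^{\top}\bigr)\odot\bigl(B^{\top} d\,B\bigr)\right]A
```

The coeff table in weight_trans_c4 plays the role of G here, applied to every 3x3 kernel ahead of time so only the input and output transforms run per tile.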
+ +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu); +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); + +/* +*The following function conv_compute_6x6_3x3 is base on +*MNN[https://github.com/alibaba/MNN] +* +*Copyright © 2018, Alibaba Group Holding Limited +*/ +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + float zero_ptr[8]; + memset(zero_ptr, 0, 8 * sizeof(float)); + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 256; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 256; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 256; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 256; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? 
tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index * 6; + int src_y = th_index * 6; + int ex = src_x + 8 > w_pad ? w_pad - src_x : 8; + int ey = src_y + 8 > h_pad ? h_pad - src_y : 8; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 8 && ey == 8) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + for (int i = 0; i < 8; ++i) { + const float* ci_ptr = src_ci + i * w_pad * 4; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 256 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 32; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + for (int i = 0; i < 8; ++i) { + float* ci_ptr = trans_remain_tmp_data + i * 32; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 256; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 64; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small(oc_4 * 4, + tile_count, + ic_4 * 4, + origin_A, + origin_B, + origin_C, + nullptr, + false, + false, + ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 6; + int dst_y = th_index * 6; + + int ex = dst_x + 6 > wout ? wout - dst_x : 6; + int ey = dst_y + 6 > hout ? 
hout - dst_y : 6; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 6) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + // copy to dest + memset(trans_tmp_data, 0, 144 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 24, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute + +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + vst1q_f32(dest, dest0); + 
vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest0 = vaddq_f32(dest0, bias); + dest1 = vaddq_f32(dest1, bias); + dest2 = vaddq_f32(dest2, bias); + dest3 = vaddq_f32(dest3, bias); + dest4 = vaddq_f32(dest4, bias); + dest5 = vaddq_f32(dest5, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest0 = vmaxq_f32(dest0, zeros); + dest1 = vmaxq_f32(dest1, zeros); + dest2 = vmaxq_f32(dest2, zeros); + dest3 = vmaxq_f32(dest3, zeros); + dest4 = vmaxq_f32(dest4, zeros); + dest5 = vmaxq_f32(dest5, zeros); + } + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + float32x4_t src0 = vld1q_f32(src); + float32x4_t src1 = vld1q_f32(src + src_stride); + float32x4_t src2 = vld1q_f32(src + src_stride * 2); + float32x4_t src3 = vld1q_f32(src + src_stride * 3); + float32x4_t src4 = vld1q_f32(src + src_stride * 4); + float32x4_t src5 = vld1q_f32(src + src_stride * 5); + float32x4_t src6 = vld1q_f32(src + src_stride * 6); + float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t dst0 = vaddq_f32(vsubq_f32(src0, src6), + vmulq_n_f32(vsubq_f32(src4, src2), 5.25)); + float32x4_t dst7 = vaddq_f32(vsubq_f32(src7, src1), + vmulq_n_f32(vsubq_f32(src3, src5), 5.25)); + + float32x4_t tmp12a = + vsubq_f32(vaddq_f32(src2, src6), vmulq_n_f32(src4, 4.25)); + float32x4_t tmp12b = + vsubq_f32(vaddq_f32(src1, src5), vmulq_n_f32(src3, 4.25)); + float32x4_t dst1 = vaddq_f32(tmp12a, tmp12b); + float32x4_t dst2 = vsubq_f32(tmp12a, tmp12b); + + float32x4_t 
tmp34a = vsubq_f32(vaddq_f32(src6, vmulq_n_f32(src2, 0.25)), + vmulq_n_f32(src4, 1.25)); + float32x4_t tmp34b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 0.5), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 2)); + float32x4_t dst3 = vaddq_f32(tmp34a, tmp34b); + float32x4_t dst4 = vsubq_f32(tmp34a, tmp34b); + + float32x4_t tmp56a = + vaddq_f32(src6, vmulq_n_f32(vsubq_f32(src2, vmulq_n_f32(src4, 1.25)), 4)); + float32x4_t tmp56b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 2), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 0.5)); + float32x4_t dst5 = vaddq_f32(tmp56a, tmp56b); + float32x4_t dst6 = vsubq_f32(tmp56a, tmp56b); + + vst1q_f32(dest, dst0); + vst1q_f32(dest + dest_stride, dst1); + vst1q_f32(dest + dest_stride * 2, dst2); + vst1q_f32(dest + dest_stride * 3, dst3); + vst1q_f32(dest + dest_stride * 4, dst4); + vst1q_f32(dest + dest_stride * 5, dst5); + vst1q_f32(dest + dest_stride * 6, dst6); + vst1q_f32(dest + dest_stride * 7, dst7); +} +void weight_trans_c4( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + {0.0f, 0.0f, 1.0f}}; + + float* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 3) / 4 * 4; + int ic_pad = (ch_in + 3) / 4 * 4; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 64; ++i) { + int new_c = i % 64; + int new_oc = i / ch_in / 64 / 4; + int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_inner = i / ch_in / 64 % 4; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index 6a1fa37681585883280625a22c15aec43c6554af..b4972a1ecab151947f8aaa7d6db0f6e82a08e5e4 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param, auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); const int threads = ctx->threads(); + auto paddings = *param.paddings; int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data, ARMContext* ctx) { const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round + 2; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc index f966313e118acf3f74124aca1d16aa3c50009bb8..64e72bc441bb93fa955e12ff53ce17f0e37b4830 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc @@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din, const operators::ConvParam& param, Context* ctx, const float* scale) { + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4c9fb99ef9a6b5d3987a1efd5a644f322ea043c --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -0,0 +1,2539 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1_fp32(const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext *ctx) { + if (pad == 0) { + if (w_in > 5) { + conv_depthwise_3x3s1p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 4) { + conv_depthwise_3x3s1p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 
{v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * 
w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* 
outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ 
+ "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s 
, v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, 
%[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, 
[%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = (w_in + 3) >> 2; + int cnt_col = tile_w - 2; + + unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
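+          // Note: the switch above relies on intentional fall-through: every
+          // input row index past h_in is redirected to zero_ptr (a zeroed
+          // scratch row), and the block below likewise redirects output rows
+          // past h_out to write_ptr (a scratch row), so the four-row kernel
+          // always runs unconditionally.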
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
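+          // Note (armv7 path): each pass reads four input rows (din_ptr0..3)
+          // and writes two output rows, instead of the six-in/four-out tiling
+          // of the aarch64 branch above; rows past h_in again fall back to the
+          // zeroed scratch row through the fall-through switch.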
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing 
batches
+}
+
+/**
+ * \brief depthwise convolution, kernel size 3x3, stride 1, pad 0, with bias,
+ * width > 4
+ */
+void conv_depthwise_3x3s1p0_bias(float *dout,
+                                 const float *din,
+                                 const float *weights,
+                                 const float *bias,
+                                 bool flag_bias,
+                                 bool flag_relu,
+                                 const int num,
+                                 const int ch_in,
+                                 const int h_in,
+                                 const int w_in,
+                                 const int h_out,
+                                 const int w_out,
+                                 ARMContext *ctx) {
+  //! pad is done implicit
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+
+  unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
+  const int remian_idx[4] = {0, 1, 2, 3};
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+
+        dr0 = dr4;
+        dr1 = dr5;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 >= h_in) {
+          switch (i + 5 - h_in) {
+            case 4:
+              din_ptr1 = zero_ptr;
+            case 3:
+              din_ptr2 = zero_ptr;
+            case 2:
+              din_ptr3 = zero_ptr;
+            case 1:
+              din_ptr4 = zero_ptr;
+            case 0:
+              din_ptr5 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + doutr1 = trash_buf; + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..08e5efecd751bcca534ba7a47035c5f70fa1f6bf --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
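+//
+// The kernel below computes a 3x3, stride-1 depthwise convolution on input
+// that is pre-packed 4 channels at a time (NHWC4 layout, see
+// prepack_input_nxwc4_dw). As a rough guide to the arithmetic the
+// NEON/assembly path implements, a scalar sketch for one channel would look
+// like the loop below; the names in, wei, out and bias_value are illustrative
+// only and do not appear in this file:
+//
+//   for (int oh_i = 0; oh_i < oh; ++oh_i) {
+//     for (int ow_i = 0; ow_i < ow; ++ow_i) {
+//       float acc = bias_value;  // per-channel bias, 0 if no bias
+//       for (int kh = 0; kh < 3; ++kh) {
+//         for (int kw = 0; kw < 3; ++kw) {
+//           int ih_i = oh_i - pad_h + kh;
+//           int iw_i = ow_i - pad_w + kw;
+//           if (ih_i >= 0 && ih_i < ih && iw_i >= 0 && iw_i < win) {
+//             acc += in[ih_i * win + iw_i] * wei[kh * 3 + kw];
+//           }
+//         }
+//       }
+//       out[oh_i * ow + ow_i] = flag_relu ? std::max(acc, 0.f) : acc;
+//     }
+//   }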
+
+#include <arm_neon.h>
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/core/context.h"
+#include "lite/operators/op_params.h"
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+void conv_3x3s1_depthwise_fp32(const float* i_data,
+                               float* o_data,
+                               int bs,
+                               int oc,
+                               int oh,
+                               int ow,
+                               int ic,
+                               int ih,
+                               int win,
+                               const float* weights,
+                               const float* bias,
+                               const operators::ConvParam& param,
+                               ARMContext* ctx) {
+  int threads = ctx->threads();
+
+  auto paddings = *param.paddings;
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[2];
+
+  const int out_c_block = 4;
+  const int out_h_kernel = 2;
+  const int out_w_kernel = 4;
+  const int win_ext = ow + 2;
+  const int ow_round = ROUNDUP(ow, 4);
+  const int win_round = ROUNDUP(win_ext, 4);
+  const int hin_round = oh + 2;
+  const int prein_size = win_round * hin_round * out_c_block;
+  auto workspace_size =
+      threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
+  ctx->ExtendWorkspace(sizeof(float) * workspace_size);
+
+  bool flag_relu = param.fuse_relu;
+  bool flag_bias = param.bias != nullptr;
+
+  /// get workspace
+  float* ptr_zero = ctx->workspace_data<float>();
+  memset(ptr_zero, 0, sizeof(float) * win_round);
+  float* ptr_write = ptr_zero + win_round;
+
+  int size_in_channel = win * ih;
+  int size_out_channel = ow * oh;
+
+  int ws = -pad_w;
+  int we = ws + win_round;
+  int hs = -pad_h;
+  int he = hs + hin_round;
+  int w_loop = ow_round / 4;
+  auto remain = w_loop * 4 - ow;
+  bool flag_remain = remain > 0;
+  remain = 4 - remain;
+  remain = remain > 0 ? remain : 0;
+  int row_len = win_round * out_c_block;
+
+  for (int n = 0; n < bs; ++n) {
+    const float* din_batch = i_data + n * ic * size_in_channel;
+    float* dout_batch = o_data + n * oc * size_out_channel;
+#pragma omp parallel for num_threads(threads)
+    for (int c = 0; c < oc; c += out_c_block) {
+#ifdef ARM_WITH_OMP
+      float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
+#else
+      float* pre_din = ptr_write + ow_round;
+#endif
+      /// const array size
+      float pre_out[out_c_block * out_w_kernel * out_h_kernel];  // NOLINT
+      prepack_input_nxwc4_dw(
+          din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
+      const float* weight_c = weights + c * 9;  // kernel_w * kernel_h
+      float* dout_c00 = dout_batch + c * size_out_channel;
+      float bias_local[4] = {0, 0, 0, 0};
+      if (flag_bias) {
+        bias_local[0] = bias[c];
+        bias_local[1] = bias[c + 1];
+        bias_local[2] = bias[c + 2];
+        bias_local[3] = bias[c + 3];
+      }
+      float32x4_t vbias = vld1q_f32(bias_local);
+#ifdef __aarch64__
+      float32x4_t w0 = vld1q_f32(weight_c);       // w0, v23
+      float32x4_t w1 = vld1q_f32(weight_c + 4);   // w1, v24
+      float32x4_t w2 = vld1q_f32(weight_c + 8);   // w2, v25
+      float32x4_t w3 = vld1q_f32(weight_c + 12);  // w3, v26
+      float32x4_t w4 = vld1q_f32(weight_c + 16);  // w4, v27
+      float32x4_t w5 = vld1q_f32(weight_c + 20);  // w5, v28
+      float32x4_t w6 = vld1q_f32(weight_c + 24);  // w6, v29
+      float32x4_t w7 = vld1q_f32(weight_c + 28);  // w7, v30
+      float32x4_t w8 = vld1q_f32(weight_c + 32);  // w8, v31
+#endif
+      for (int h = 0; h < oh; h += out_h_kernel) {
+        float* outc00 = dout_c00 + h * ow;
+        float* outc01 = outc00 + ow;
+        float* outc10 = outc00 + size_out_channel;
+        float* outc11 = outc10 + ow;
+        float* outc20 = outc10 + size_out_channel;
+        float* outc21 = outc20 + ow;
+        float* outc30 = outc20 + size_out_channel;
+        float* outc31 = outc30 + ow;
+        const float* inr0 = 
pre_din + h * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc10 = ptr_write; + outc11 = ptr_write; + case 2: + outc20 = ptr_write; + outc21 = ptr_write; + case 1: + outc30 = ptr_write; + outc31 = ptr_write; + default: + break; + } + } + if (h + out_h_kernel > oh) { + outc01 = ptr_write; + outc11 = ptr_write; + outc21 = ptr_write; + outc31 = ptr_write; + } + float* outl[] = {outc00, + outc10, + outc20, + outc30, + outc01, + outc11, + outc21, + outc31, + reinterpret_cast(bias_local), + reinterpret_cast(flag_relu)}; + void* outl_ptr = reinterpret_cast(outl); + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + float* out0 = pre_out; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ + /* r0, r1, mul w0, get out r0, r1 */ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ + /* r0, r1, mul w1, get out r0, r1 */ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ + /* r0, r1, mul w2, get out r0, r1 */ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ + /* r1, r2, mul w3, get out r0, r1 */ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ + /* r1, r2, mul w4, get out r0, r1 */ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ + 
"ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ + "ldp x0, x1, [%[outl]] \n" + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ + /* r1, r2, mul w5, get out r0, r1 */ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ + /* r2, r3, mul w6, get out r0, r1 */ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ + "ldp x2, x3, [%[outl], #16] \n" + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ + "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ + /* r2, r3, mul w7, get out r0, r1 */ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ + "ldp x4, x5, [%[outl], #32] \n" + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ + /* r2, r3, mul w8, get out r0, r1 */ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ + "ldp x6, x7, [%[outl], #48] \n" + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ + + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ + + /* transpose */ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ + "trn2 v5.4s, v19.4s, v20.4s\n" /* 
r1: b0b1d0d1*/ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "0:\n" + "cbnz %w[flag_mask], 1f\n" + "str q15, [x0]\n" /* save outc00 */ + "str q16, [x4]\n" /* save outc01 */ + "str q17, [x1]\n" /* save outc10 */ + "str q18, [x5]\n" /* save outc11 */ + "str q19, [x2]\n" /* save outc20 */ + "str q20, [x6]\n" /* save outc21 */ + "str q21, [x3]\n" /* save outc30 */ + "str q22, [x7]\n" /* save outc31 */ + "b 2f\n" + "1:\n" + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ + "2:\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), [inr3] "+r"(inr3), + [out]"+r"(out0) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [vbias]"w" (vbias), [outl] "r" (outl_ptr), + [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v9", "v10", "v11", "v15", + "v16","v17","v18","v19","v20","v21","v22", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" + ); +#else + asm volatile( + /* load weights */ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" + /* load r0, r1 */ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" + "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" + /* main loop */ + "0: @ main loop\n" + /* mul r0 with w0, w1, w2, get out r0 */ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" + /* mul r1 with w0-w5, get out r0, r1 */ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" + "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" + /* mul r1 with w1, w4, get out r1, r0 */ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" + /* mul r1 with w2, w5, get out r1, r0 */ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" + /* mul r2 with w3-w8, get out r0, r1 */ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" + /* mul r2 with w4, w7, get out r1, r0 */ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" + /* mul r1 with w5, w8, get out r1, r0 */ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + /* mul r3 with w6, w7, w8, get out r1 */ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" + "vld1.32 {d12-d13}, [r4] @ load bias\n" + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" + "ldr r0, [%[outl]] @ load outc00 to r0\n" + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" + "vadd.f32 q8, q8, q6 @ r00 add bias\n" + "vadd.f32 q9, q9, q6 @ r01 add bias\n" + "vadd.f32 q10, q10, q6 @ r02 add bias\n" + "vadd.f32 q11, q11, q6 @ r03 add bias\n" + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" + "vadd.f32 q12, q12, q6 @ r10 add bias\n" + "vadd.f32 q13, q13, q6 @ r11 add bias\n" + "vadd.f32 q14, q14, q6 @ r12 add bias\n" + "vadd.f32 q15, q15, q6 @ r13 add bias\n" + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" + "vmov.u32 q7, #0 @ mov zero to q7\n" + "cmp r5, #0 @ cmp flag relu\n" + "beq 1f @ skip relu\n" + "vmax.f32 q8, q8, q7 @ r00 relu\n" + "vmax.f32 q9, q9, q7 @ r01 relu\n" + "vmax.f32 q10, q10, q7 @ r02 relu\n" + "vmax.f32 q11, q11, q7 @ r03 relu\n" + "vmax.f32 q12, q12, q7 @ r10 relu\n" + "vmax.f32 q13, q13, q7 @ r11 relu\n" + "vmax.f32 q14, q14, q7 @ r12 relu\n" + "vmax.f32 q15, q15, q7 @ r13 relu\n" + "1:\n" + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" + "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" + "cmp %[flag_mask], #0 @ cmp flag mask\n" + "bne 2f\n" + "vst1.32 {d16-d17}, [r0] @ save outc00\n" + "vst1.32 {d18-d19}, [r1] @ save outc10\n" + "vst1.32 {d20-d21}, [r2] @ save outc20\n" + "vst1.32 {d22-d23}, [r3] @ save outc30\n" + "vst1.32 {d24-d25}, [r4] @ save outc01\n" + "vst1.32 {d26-d27}, [r5] @ save outc11\n" + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" + "vst1.32 {d28-d29}, [r0] @ save outc21\n" + "vst1.32 {d30-d31}, [r1] @ save outc31\n" + "b 3f @ branch end\n" + "2: \n" + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" + "3: \n" + : [r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [r3] "+r"(inr3), + [out0] "+r"(out0), [wc0] "+r"(weight_c) + : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" + ); +#endif // __arch64__ + // clang-format on + outl[0] += 4; + outl[1] += 4; + outl[2] += 4; + outl[3] += 4; + outl[4] += 4; + outl[5] += 4; + outl[6] += 4; + outl[7] += 4; + if (flag_mask) { + memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); + memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); + memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); + memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); + memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); + memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); + memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); + memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 8260718a50f8e2fa8497d41d958e82a45ea0480d..807135f57dfadf690277ab57bd5597e9470ae549 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ARMContext* ctx) { auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); + auto paddings = *param.paddings; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! 3x3s2 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round * 2 /*stride_w*/ + 1; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc index 01b7a812ebc05a054bb9952bf53605ce7aed135a..26829544bfd34d7acfc1d49086e86c3e0edad5f1 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc @@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; @@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! 
write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); //! set 1/4 l2 cache int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..455781e37e0747950e6740f6db45c1ce8c0e96c8 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -0,0 +1,1862 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2_fp32(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext* ctx) { + if (pad == 0) { + if (w_in > 7) { + conv_depthwise_3x3s2p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 7) { + conv_depthwise_3x3s2p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} +#ifdef __aarch64__ +#define 
INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, 
#4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ 
+ \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, 
[%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int size_right_remain = w_in - (7 + cnt_col * 8); + if 
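// Note (inferred from the pointer arithmetic above): each middle-loop
// iteration consumes 8 input columns plus one look-ahead element for the
// shifted tap and emits 4 outputs, while the left tile consumes only 7
// columns because of the left padding; a further full middle iteration
// therefore fits only when at least 9 columns are still available: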
(size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! 
process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! 
process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", 
+ "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 
+ w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + 
[wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..9852c0f84eae8451ef795c95faddfc88e833bea8 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_3x3s2_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 1; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 1; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc1 = ptr_write; + case 2: + outc2 = ptr_write; + case 1: + 
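                  // intentional fall-through: when the last channel block has
                  // fewer than four valid output channels, the surplus channel
                  // pointers are all redirected to the scratch buffer
                  // ptr_write so the vector stores stay in bounds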
outc3 = ptr_write; + default: + break; + } + } + auto c0 = outc0; + auto c1 = outc1; + auto c2 = outc2; + auto c3 = outc3; + float pre_out[16]; + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + if (flag_mask) { + c0 = outc0; + c1 = outc1; + c2 = outc2; + c3 = outc3; + outc0 = pre_out; + outc1 = pre_out + 4; + outc2 = pre_out + 8; + outc3 = pre_out + 12; + } +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldr q8, [%[bias]]\n" /* load bias */ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "and v19.16b, v8.16b, v8.16b\n" + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "and v20.16b, v8.16b, v8.16b\n" + "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ + "and v21.16b, v8.16b, v8.16b\n" + "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ + "and v22.16b, v8.16b, v8.16b\n" + "ldr q8, [%[inr0]]\n" /* load input r0*/ + /* r0 mul w0-w2, get out */ + "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ + "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ + "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ + "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ + "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ + "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ + "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ + "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ + "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ + "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ + "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ + "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ + "ldr q8, [%[inr1]]\n" /* load input r1*/ + /* r1, mul w3-w5, get out */ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ + "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ + "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ + "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ + "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ + "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ + "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ + "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ + "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ + "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ + "ldr q8, [%[inr2]]\n" /* load input r2*/ + /* r2, mul w6-w8, get out r0, r1 */ + "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ + "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ + "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ + "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ + "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ + "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ + "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ + "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ + "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ + "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ + "fmla v21.4s , %[w8].4s, 
v6.4s\n" /* outr2 = w8 * r2, 6*/ + "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ + /* transpose */ + "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ + "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + /* relu */ + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + /* save result */ + "0:\n" + "str q19, [%[outc0]], #16\n" + "str q20, [%[outc1]], #16\n" + "str q21, [%[outc2]], #16\n" + "str q22, [%[outc3]], #16\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v19","v20","v21","v22" + ); +#else + asm volatile( + /* fill with bias */ + "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ + /* load weights */ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ + "vand.i32 q12, q8, q8\n" + "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ + "vand.i32 q13, q8, q8\n" + "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ + "vand.i32 q14, q8, q8\n" + "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ + "vand.i32 q15, q8, q8\n" + "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ + /* mul r0 with w0, w1, w2 */ + "vmla.f32 q12, q9, q0 @ w0 * inr0\n" + "vmla.f32 q13, q9, q2 @ w0 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ + "vmla.f32 q14, q9, q4 @ w0 * inr4\n" + "vmla.f32 q15, q9, q6 @ w0 * inr6\n" + "vmla.f32 q12, q10, q1 @ w1 * inr1\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w1 * inr3\n" + "vmla.f32 q14, q10, q5 @ w1 * inr5\n" + "vmla.f32 q15, q10, q7 @ w1 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w2 * inr2\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w2 * inr4\n" + "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w2 * inr6\n" + "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w2 * inr8\n" + /* mul r1 with w3, w4, w5 */ + "vmla.f32 q12, q9, q0 @ w3 * inr0\n" + "vmla.f32 q13, q9, q2 @ w3 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ + "vmla.f32 q14, q9, q4 @ w3 * inr4\n" + "vmla.f32 q15, q9, q6 @ w3 * inr6\n" + "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ + "vmla.f32 q12, q10, q1 @ w4 * inr1\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w4 * inr3\n" + "vmla.f32 q14, q10, q5 @ w4 * inr5\n" + "vmla.f32 q15, q10, q7 @ w4 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w5 * inr2\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w5 * inr4\n" + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w5 * inr6\n" + "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w5 * inr8\n" + /* mul r2 with w6, w7, w8 */ + "vmla.f32 q12, q9, q0 @ w6 * inr0\n" + "vmla.f32 q13, q9, q2 @ w6 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ + "vmla.f32 q14, q9, q4 @ w6 * inr4\n" + "vmla.f32 q15, q9, q6 @ w6 * inr6\n" + "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ + "vmla.f32 q12, q10, q1 @ w7 * inr1\n" + "vmla.f32 q13, q10, q3 @ w7 * inr3\n" + "vmla.f32 q14, q10, q5 @ w7 * inr5\n" + "vmla.f32 q15, q10, q7 @ w7 * inr7\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + "vmla.f32 q12, q11, q2 @ w8 * inr2\n" + "vmla.f32 q13, q11, q4 @ w8 * inr4\n" + "vmla.f32 q14, q11, q6 @ w8 * inr6\n" + "vmla.f32 q15, q11, q8 @ w8 * inr8\n" + /* transpose */ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ + "cmp %[flag_relu], #0\n" + "beq 0f\n" /* skip relu*/ + "vmov.u32 q0, #0\n" + "vmax.f32 q12, q12, q0\n" + "vmax.f32 q13, q13, q0\n" + "vmax.f32 q14, q14, q0\n" + "vmax.f32 q15, q15, q0\n" + "0:\n" + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + :[r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [wc0] "+r" (weight_c), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[bias] "r" (bias_local), + [flag_relu]"r"(flag_relu) + :"cc", "memory", + "q0","q1","q2","q3","q4","q5","q6","q7", + "q8", "q9","q10","q11","q12","q13","q14","q15" + ); +#endif // __arch64__ + // clang-format off + if (flag_mask) { + for (int i = 0; i < remain; ++i) { + c0[i] = pre_out[i]; + c1[i] = pre_out[i + 4]; + c2[i] = pre_out[i + 8]; + c3[i] = pre_out[i + 12]; + } + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index b2d16d18d2300ea51de8c8e9f25664ffdf4aebc7..e4279d9a728bc7af0f14a00b781db449fc426582 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -254,6 +254,7 @@ inline void prepack_input_nxwc4_dw(const float* din, LOG(FATAL) << "prepack_dw_input, valid height must > zero"; } float32x4_t vzero = vdupq_n_f32(0.f); + auto out_data = dout; int size_w = we - ws; int w0 = ws < 0 ? 0 : ws; @@ -269,6 +270,7 @@ inline void prepack_input_nxwc4_dw(const float* din, bool flag_ext_l = left_remain > 0; int left_sl = 4 - left_remain; + int left_valid_sl = left_sl > width ? 
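  // clamp the left-edge advance: after the left-padded block the channel
  // pointers step forward by at most `width` columns, so rows narrower than
  // left_sl are not over-stepped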
width : left_sl; uint32x4_t vmask_padl; bool flag_mask_l = false; if (flag_ext_l) { @@ -290,6 +292,7 @@ inline void prepack_input_nxwc4_dw(const float* din, } int size_c = width * height; for (int h = hs; h < he; ++h) { + dout = out_data + (h - hs) * 4 * size_w; auto ptr_c0 = din + cs * size_c + h * width; auto ptr_c1 = ptr_c0 + size_c; auto ptr_c2 = ptr_c1 + size_c; @@ -351,10 +354,10 @@ inline void prepack_input_nxwc4_dw(const float* din, } transpose_4x4(vc0, vc1, vc2, vc3, dout); dout += 16; - ptr_c0 += left_sl; - ptr_c1 += left_sl; - ptr_c2 += left_sl; - ptr_c3 += left_sl; + ptr_c0 += left_valid_sl; + ptr_c1 += left_valid_sl; + ptr_c2 += left_valid_sl; + ptr_c3 += left_valid_sl; } /// valid for (int i = 0; i < cnt_valid; ++i) { @@ -722,7 +725,57 @@ inline bool write_to_output_c1_fp32(const float* din, } return true; } - +#ifdef __aarch64__ +#define NCHWC2_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ \ + "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ + +#define NCHWC2_TRANS_FP32_RELU \ + "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ \ + "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ + +#define NCHWC2_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC2_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " \ + "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " \ + "c1r0, c1r1 \n" \ + "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " \ + "c1r2, c1r3 \n" \ + \ + "vswp d1, d2 @ swap data\n" + +#define NCHWC2_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" + +#define NCHWC2_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -777,127 +830,41 @@ inline bool write_to_output_c2_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -922,6 +889,70 @@ inline bool write_to_output_c2_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC4_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ \ + "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ + +#define NCHWC4_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ + +#define NCHWC4_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC4_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " \ + "\n" \ + "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " \ + "\n" \ + \ + "vswp d1, d4 @ swap data\n" \ + "vswp d3, d6 @ swap data\n" + +#define NCHWC4_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC4_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -958,7 +989,9 @@ inline bool write_to_output_c4_fp32(const float* din, int size_h = (he > height ? height : he) - hs; // size_h == hei_n - int cnt = (width - ws) / w4; + int valid_we = we > width ? 
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; @@ -983,185 +1016,88 @@ inline bool write_to_output_c4_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v8", + "v9", + "v10", + "v11", + "v16", + "v17", + "v18", + "v19"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3"); #endif } } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; + int j = 0; if (flag_relu) { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); @@ -1169,7 +1105,7 @@ inline bool write_to_output_c4_fp32(const float* din, din_hei_ptr += w4; } } else { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; *(doutc2_ptr++) = din_hei_ptr[2]; @@ -1182,6 +1118,120 @@ inline bool write_to_output_c4_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC8_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + \ + "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ \ + "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ \ + "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ \ + "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ \ + "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ \ + "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ \ + "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ + +#define NCHWC8_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ \ + \ + "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ \ + "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ + +#define NCHWC8_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 
\n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ \ + "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ \ + "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ \ + "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC8_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q2 @ trans q0, q2 \n" \ + "vtrn.32 q4, q6 @ trans q4, q6 \n" \ + "vswp.32 d1, d8 @ swap d1, d8 \n" \ + "vswp.32 d5, d12 @ swap d5, d12\n" \ + \ + "vtrn.32 q1, q3 @ trans q1, q3 \n" \ + "vtrn.32 q5, q7 @ trans q5, q7 \n" \ + "vswp.32 d3, d10 @ swap d3, d10\n" \ + "vswp.32 d7, d14 @ swap d7, d14\n" + +#define NCHWC8_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" \ + \ + "vmax.f32 q4, q4, q15 @ relu\n" \ + "vmax.f32 q5, q5, q15 @ relu\n" \ + "vmax.f32 q6, q6, q15 @ relu\n" \ + "vmax.f32 q7, q7, q15 @ relu\n" + +#define NCHWC8_TRANS_FP32_STORE \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \ + \ + "bne 1b @ jump to main loop\n" + +#endif /*wirte result in outputs * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] */ @@ -1261,158 +1311,54 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] 
"+r"(din_hei_ptr) + : + : "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4", "q15"); #endif } if (we > width) { @@ -1468,138 +1414,53 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE 
NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4"); #endif } if (we > width) { diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 1a23982cd575afb6b249390de7081165c03414b9..b6c3478880d5cb59999d23ff03e2e342708ca95b 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -85,38 +85,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_relu, ARMContext* ctx); -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - template void conv_depthwise_3x3s1_int8(Dtype* dout, const int8_t* din, diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 010563bf936c2f8454162c8aad48cd8815c5f7af..dc68e65f42a799d7fa7e8be75f5afcf3166b1df3 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -107,29 +107,35 @@ void im2col(const Dtype* data_im, int width, int kernel_h, int kernel_w, - int pad_h, - int pad_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, int stride_h, int stride_w, int dilation_h, int dilation_w, Dtype* data_col) { const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_top + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { for (int output_cols = output_w; output_cols; output_cols--) { *(data_col++) = 0; } } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_left + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { *(data_col++) = data_im[input_row * width + input_col]; @@ 
-202,7 +208,8 @@ void conv1x1s1_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { sgemm_prepack(false, m, @@ -361,6 +368,8 @@ void conv_im2col_gemm(const float* i_data, float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto dilations = *param.dilations; //! use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -378,12 +387,14 @@ void conv_im2col_gemm(const float* i_data, win, kernel_h, kernel_w, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dB); if (n == 1) { @@ -395,7 +406,8 @@ void conv_im2col_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { int ldb = n; sgemm_prepack(false, @@ -434,14 +446,16 @@ void conv_im2col_gemm_int8(const int8_t* i_data, const float* scale) { int group = param.groups; auto filter_dims = param.filter->dims(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int m = oc / group; const int n = oh * ow; const int k = ic * kernel_h * kernel_w / group; @@ -482,7 +496,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data, kernel_h, kernel_w, pad_h, + paddings[1], pad_w, + paddings[3], stride_h, stride_w, dila_h, @@ -562,90 +578,83 @@ void conv_depthwise_3x3_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - if (pad_w != pad_h) { - LOG(FATAL) << "fp32 depthwise conv3x3 pad_w: " << pad_w - << ", pad_h: " << pad_h << " must be equal"; - return; - } + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; int stride = param.strides[1]; int pad = pad_w; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; - if (stride == 1 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 2 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s2_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride - << " or pad(<2): " << pad << " unsupported"; - } -#if 0 - if (pad == 1) { - conv_depthwise_3x3p1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && h_in > 2) { - conv_depthwise_3x3p0_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, 
- ctx); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (stride == 1) { + if (pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s1_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } + + } else if (stride == 2) { + if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s2_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s2_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; + LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride << " unsupported"; } -#endif } void conv_depthwise_5x5_fp32(const void* din, @@ -662,7 +671,8 @@ void conv_depthwise_5x5_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad = param.paddings[1]; + auto paddings = *param.paddings; + int pad = paddings[0]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -718,8 +728,9 @@ void conv_depthwise_3x3_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -776,8 +787,9 @@ void conv_depthwise_3x3_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -834,8 +846,9 @@ void conv_depthwise_5x5_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -875,8 +888,9 @@ void conv_depthwise_5x5_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index c5baa31e1414c4a7a0c926728e5c150c0fc3e21c..f4d00039aaa635d0ffb31846fd9ff9077ac0c621 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -314,7 
+314,23 @@ void fill_bias_int8(int* tensor, const int* bias, int channel, int channel_size); +// new winograd +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc index 87b08f63102104b325e95c093fe0fc0aaef243e0..894b946a32ccb7c487087291894a01a1d79334fa 100644 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ b/lite/backends/arm/math/conv_winograd_3x3.cc @@ -37,9 +37,9 @@ void conv_winograd3x3(const float* din, const operators::ConvParam& param, ARMContext* ctx) { int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[1]; int size_in_channel = win * hin; int size_out_channel = wout * hout; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index d8ef6ff47d0392ac15caf2d94b7c53ff63659da2..8977b5712c13dec0088d83db4cbfef8494785301 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -39,10 +39,12 @@ #include "lite/backends/arm/math/im2sequence.h" #include "lite/backends/arm/math/increment.h" #include "lite/backends/arm/math/interpolate.h" +#include "lite/backends/arm/math/layout.h" #include "lite/backends/arm/math/lrn.h" #include "lite/backends/arm/math/negative.h" #include "lite/backends/arm/math/norm.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" #include "lite/backends/arm/math/pad2d.h" #include "lite/backends/arm/math/pooling.h" #include "lite/backends/arm/math/power.h" diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc index f89410ad11590c60bf5542702b60fa883298d3e6..e9e18043dfc09001ebba23f952a59474630e54aa 100644 --- a/lite/backends/arm/math/interpolate.cc +++ b/lite/backends/arm/math/interpolate.cc @@ -22,6 +22,28 @@ namespace lite { namespace arm { namespace math { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + // The following function bilinear_interp is partially base on // https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp // Tencent is pleased to support the open source community by making ncnn @@ -472,33 +494,52 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type) { + 
int in_h = X->dims()[2]; + int in_w = X->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height = new_size[0]; + out_width = new_size[1]; + } else { + auto scale_tensor = Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height = static_cast(in_h * scale); + out_width = static_cast(in_w * scale); + } + auto out_size = OutSize; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height = static_cast(out_size_data[0]); + out_width = static_cast(out_size_data[1]); + } + } + float height_scale = scale; + float width_scale = scale; if (out_width > 0 && out_height > 0) { height_scale = static_cast(out_height / X->dims()[2]); width_scale = static_cast(out_width / X->dims()[3]); } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = X->dims()[0]; + int c_cout = X->dims()[1]; + Out->Resize({num_cout, c_cout, out_height, out_width}); float* dout = Out->mutable_data(); const float* din = X->data(); int out_num = Out->dims()[0]; int out_c = Out->dims()[1]; int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; int out_h = Out->dims()[2]; int out_w = Out->dims()[3]; int spatial_in = in_h * in_w; diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h index be250f6a5e7581ba70809362d169167fea1d1c11..e9c41c5bc86c8f00d57e096e3cd2b5f37df3a474 100644 --- a/lite/backends/arm/math/interpolate.h +++ b/lite/backends/arm/math/interpolate.h @@ -44,11 +44,12 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type); diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd9126ab48c8f829c82d0c78a338074c695f0b9c --- /dev/null +++ b/lite/backends/arm/math/layout.cc @@ -0,0 +1,668 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
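For orientation, the new lite/backends/arm/math/layout.cc introduced below provides NEON-accelerated NCHW2NHWC / NHWC2NCHW repacks; the scalar tail loops in that file define the element mapping the assembly reproduces. A minimal scalar sketch of that mapping follows as an editor's illustration only — it is not part of the patch, and the helper name is hypothetical:

// Reference semantics of NCHW2NHWC(N, C, HxW, X, Y): for each sample n,
// element (c, s) of the NCHW input lands at position (s, c) of the NHWC output.
template <typename T>
void nchw_to_nhwc_reference(int N, int C, int HxW, const T* X, T* Y) {
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int s = 0; s < HxW; ++s) {
        // NCHW offset: (n*C + c)*HxW + s ; NHWC offset: (n*HxW + s)*C + c
        Y[(n * HxW + s) * C + c] = X[(n * C + c) * HxW + s];
      }
    }
  }
}
// NHWC2NCHW is the inverse transform: swap the two index expressions above.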
+ +#include "lite/backends/arm/math/layout.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +#ifdef __aarch64__ +#define TRANS_C4 \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "1: \n" \ + "trn1 v4.4s, v0.4s, v1.4s \n" /*00 10 02 12 */ \ + "trn1 v5.4s, v2.4s, v3.4s \n" /*20 30 22 32 */ \ + "trn2 v6.4s, v0.4s, v1.4s \n" /*01 11 03 13 */ \ + "trn2 v7.4s, v2.4s, v3.4s \n" /*21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + \ + "trn1 v8.2d, v4.2d, v5.2d \n" /*00 10 20 30 */ \ + "trn1 v9.2d, v6.2d, v7.2d \n" /*01 11 21 31 */ \ + "trn2 v10.2d, v4.2d, v5.2d \n" /*02 12 22 32 */ \ + "trn2 v11.2d, v6.2d, v7.2d \n" /*03 13 23 33 */ \ + \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "str q8, [%[out0_ptr]], #16 \n" \ + "str q9, [%[out1_ptr]], #16 \n" \ + "str q10, [%[out2_ptr]], #16 \n" \ + "str q11, [%[out3_ptr]], #16 \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "ld1 {v0.8b}, [%[din0_ptr]] \n" \ + "ld1 {v1.8b}, [%[din1_ptr]] \n" \ + "ld1 {v2.8b}, [%[din2_ptr]] \n" \ + "ld1 {v3.8b}, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "trn1 v8.8b, v0.8b, v1.8b \n" /*00 10 02 12 04 14 06 16 */ \ + "trn1 v9.8b, v2.8b, v3.8b \n" /*20 30 22 32 */ \ + "trn2 v12.8b, v0.8b, v1.8b \n" /*01 11 03 13 05 15 07 17 */ \ + "trn2 v13.8b, v2.8b, v3.8b \n" /*21 31 23 33 */ \ + \ + "ld1 {v4.8b}, [%[din0_ptr]] \n" \ + "ld1 {v5.8b}, [%[din1_ptr]] \n" \ + "ld1 {v6.8b}, [%[din2_ptr]] \n" \ + "ld1 {v7.8b}, [%[din3_ptr]] \n" \ + \ + "trn1 v10.8b, v4.8b, v5.8b \n" /*40 50 42 52 */ \ + "trn1 v11.8b, v6.8b, v7.8b \n" /*60 70 62 72 */ \ + "trn2 v14.8b, v4.8b, v5.8b \n" /*41 51 43 53 */ \ + "trn2 v15.8b, v6.8b, v7.8b \n" /*61 71 63 73 */ \ + \ + "trn1 v0.4h, v8.4h, v9.4h \n" /*00 10 20 30 04 14 24 34*/ \ + "trn1 v2.4h, v12.4h, v13.4h \n" /*01 11 21 31 05 15 25 35*/ \ + "trn1 v1.4h, v10.4h, v11.4h \n" /*40 50 60 70 44 54 64 74*/ \ + "trn1 v3.4h, v14.4h, v15.4h \n" /*41 51 61 71 45 55 65 75*/ \ + \ + "trn2 v4.4h, v8.4h, v9.4h \n" /*02 10 20 30 06 14 24 34*/ \ + "trn2 v6.4h, v12.4h, v13.4h \n" /*03 11 21 31 07 15 25 35*/ \ + "trn2 v5.4h, v10.4h, v11.4h \n" /*42 50 60 70 46 54 64 74*/ \ + "trn2 v7.4h, v14.4h, v15.4h \n" /*43 51 61 71 47 55 65 75*/ \ + \ + "trn1 v8.2s, v0.2s, v1.2s \n" /*00 10 20 30 40 50 60 70*/ \ + "trn1 v9.2s, v2.2s, v3.2s \n" /*01 11 21 31 41 51 61 71*/ \ + "trn1 v10.2s, v4.2s, v5.2s \n" /*02 12 22 32 42 50 60 70*/ \ + "trn1 v11.2s, v6.2s, v7.2s \n" /*03 13 23 33 41 51 61 71*/ \ + \ + "trn2 v12.2s, v0.2s, v1.2s \n" /*04 14 24 34 44 54 64 74*/ \ + "trn2 v13.2s, v2.2s, v3.2s \n" /*05 15 25 35 45 55 65 75*/ \ + "trn2 v14.2s, v4.2s, v5.2s \n" /*06 16 22 32 42 50 60 70*/ \ + "trn2 v15.2s, v6.2s, v7.2s \n" /*07 17 23 33 41 51 61 71*/ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* 
din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v8.8b}, [%[out0_ptr]], #8 \n" \ + "st1 {v9.8b}, [%[out1_ptr]], #8 \n" \ + "st1 {v10.8b}, [%[out2_ptr]], #8 \n" \ + "st1 {v11.8b}, [%[out3_ptr]], #8 \n" \ + \ + "st1 {v11.8b}, [%[out4_ptr]], #8 \n" \ + "st1 {v12.8b}, [%[out5_ptr]], #8 \n" \ + "st1 {v13.8b}, [%[out6_ptr]], #8 \n" \ + "st1 {v14.8b}, [%[out7_ptr]], #8 \n" \ + "bne 1b \n" + +#else +#define TRANS_C4 \ + "1: \n" \ + "vld1.32 {d0-d1}, [%[din0_ptr]] \n" \ + "vld1.32 {d2-d3}, [%[din1_ptr]] \n" \ + "vld1.32 {d4-d5}, [%[din2_ptr]] \n" \ + "vld1.32 {d6-d7}, [%[din3_ptr]] \n" \ + \ + "vtrn.32 q0, q1 \n" /*00 10 02 12 01 11 03 13*/ \ + "vtrn.32 q2, q3 \n" /*20 30 22 32 21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + "vswp d1, d4 \n" \ + "vswp d3, d6 \n" \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.32 {d0-d1}, [%[out0_ptr]]! \n" \ + "vst1.32 {d2-d3}, [%[out1_ptr]]! \n" \ + "vst1.32 {d4-d5}, [%[out2_ptr]]! \n" \ + "vst1.32 {d6-d7}, [%[out3_ptr]]! \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "vld1.8 d0, [%[din0_ptr]] \n" \ + "vld1.8 d1, [%[din1_ptr]] \n" \ + "vld1.8 d2, [%[din2_ptr]] \n" \ + "vld1.8 d3, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.8 d0, d1 \n" /*00 10 02 12 04 14 06 16*/ \ + "vtrn.8 d2, d3 \n" /*20 30 22 32 24 34 26 36 */ \ + \ + "vld1.8 d4, [%[din0_ptr]] \n" \ + "vld1.8 d5, [%[din1_ptr]] \n" \ + "vld1.8 d6, [%[din2_ptr]] \n" \ + "vld1.8 d7, [%[din3_ptr]] \n" \ + \ + "vtrn.16 d0, d2 \n" /*00 10 20 30 04 14 24 34*/ \ + "vtrn.16 d1, d3 \n" /* 01 11 21 31 05 15 25 35 */ \ + "vtrn.8 d4, d5 \n" /*40 50 02 12 04 14 06 16*/ \ + "vtrn.8 d6, d7 \n" /*60 70 22 32 24 34 26 36 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.16 d4, d6 \n" /*40 50 60 70 04 14 24 34*/ \ + "vtrn.16 d5, d7 \n" /* 41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d4 \n" /*00 10 20 30 40 50 60 70*/ \ + "vtrn.32 d1, d5 \n" /* 01 11 21 31 41 51 61 71 */ \ + "vtrn.32 d2, d6 \n" /*02 12 22 32 42 52 62 72*/ \ + "vtrn.32 d3, d7 \n" /* 03 11 21 33 43 53 63 73 */ \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.8 {d0}, [%[out0_ptr]]! \n" \ + "vst1.8 {d1}, [%[out1_ptr]]! \n" \ + "vst1.8 {d2}, [%[out2_ptr]]! \n" \ + "vst1.8 {d3}, [%[out3_ptr]]! \n" \ + "vst1.8 {d4}, [%[out4_ptr]]! \n" \ + "vst1.8 {d5}, [%[out5_ptr]]! \n" \ + "vst1.8 {d6}, [%[out6_ptr]]! \n" \ + "vst1.8 {d7}, [%[out7_ptr]]! 
\n" \ + "bne 1b \n" + +#endif +template <> +void NCHW2NHWC(int N, int C, int size, const float* X, float* Y) { + int cnt = C >> 2; + int remain = C % 4; + int sum = C * size; + int stride = size << 4; // 4 * size + int stride_w = stride >> 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + float* out1_ptr = out0_ptr + C; + float* out2_ptr = out1_ptr + C; + float* out3_ptr = out2_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += size; + } + } + // remain size + for (; s < size; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = C >> 3; + int remain = C % 8; + int sum = C * size; + int stride = size << 3; // 8 * size + int stride_w = size << 4; // 4 * size * 4 + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + int8_t* out0_ptr = dout + s * C; + int8_t* out1_ptr = out0_ptr + C; + int8_t* out2_ptr = out1_ptr + C; + int8_t* out3_ptr = out2_ptr + C; + int8_t* out4_ptr = out3_ptr + C; + int8_t* out5_ptr = out4_ptr + C; + int8_t* out6_ptr = out5_ptr + C; + int8_t* out7_ptr = out6_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] 
"+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr = *ptr++; + *out1_ptr = *ptr++; + *out2_ptr = *ptr++; + *out3_ptr = *ptr++; + din0_ptr += size; + *out4_ptr = *ptr++; + *out5_ptr = *ptr++; + *out6_ptr = *ptr++; + *out7_ptr = *ptr++; + } + } + // remain size + for (; s < size; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + const int8_t* din4_ptr = din3_ptr + size; + const int8_t* din5_ptr = din4_ptr + size; + const int8_t* din6_ptr = din5_ptr + size; + const int8_t* din7_ptr = din6_ptr + size; + int8_t* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { + int cnt = size >> 2; + int remain = size % 4; + int sum = C * size; + int stride = C << 4; // 4 * size + int stride_w = C << 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + float* out1_ptr = out0_ptr + size; + float* out2_ptr = out1_ptr + size; + float* out3_ptr = out2_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] 
"+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = size >> 3; + int remain = size % 8; + int sum = C * size; + int stride = C << 3; // 8 * size + int stride_w = C << 4; // 4 * size + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + int8_t* out1_ptr = out0_ptr + size; + int8_t* out2_ptr = out1_ptr + size; + int8_t* out3_ptr = out2_ptr + size; + int8_t* out4_ptr = out3_ptr + size; + int8_t* out5_ptr = out4_ptr + size; + int8_t* out6_ptr = out5_ptr + size; + int8_t* out7_ptr = out6_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + *out4_ptr++ = *ptr++; + *out5_ptr++ = *ptr++; + *out6_ptr++ = *ptr++; + *out7_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + 
const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/layout.h b/lite/backends/arm/math/layout.h new file mode 100644 index 0000000000000000000000000000000000000000..ed0e2f8b78a280c513161a02bb3b3b479008145a --- /dev/null +++ b/lite/backends/arm/math/layout.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +template +void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y); + +template +void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 0d6eed9904902aa9539caf95172b0e4109e11f7d..092e6937c4fd4237410ff29565f418423494507f 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -53,6 +53,38 @@ void sgemm_prepacked_8x12(bool is_transB, bool has_bias, bool has_relu, ARMContext *ctx); + +void pack_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); + +void pack_trans_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx); #else // for kA72 void prepackA_6x8(float *out, @@ -139,13 +171,21 @@ void prepackA(float *out, bool is_trans, ARMContext *ctx) { #ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (mmax <= 4) { + if (is_trans) { + pack_trans_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + pack_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (is_trans) { + prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } #else - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || mmax <= 4) { if 
(is_trans) { prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); } else { @@ -212,22 +252,39 @@ void sgemm_prepack(bool is_transB, bool has_relu, ARMContext *ctx) { #ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); + if (M <= 4) { + sgemm_prepacked_4x4(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } else { + sgemm_prepacked_8x12(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } #else // armv7 - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || M <= 4) { sgemm_prepacked_4x8(is_transB, M, N, @@ -522,6 +579,147 @@ void prepackA_8x12(float *dout, } } } +void pack_m4(float *dout, + const float *inptr, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + int x_len = kmax - k0; + int stride = x_len * 4; + float zerobuff[x_len]; // NOLINT + memset(zerobuff, 0, sizeof(float) * x_len); + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + +#pragma omp parallel for + for (int y = m0; y < mmax; y += 4) { + float *outptr = dout + stride * (y - m0) / 4; + + const float *inptr0 = inptr + y * ldin + k0; + const float *inptr1 = inptr0 + ldin; + const float *inptr2 = inptr1 + ldin; + const float *inptr3 = inptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + + int x = x_len; + //! cope with row index exceed real size, set to zero buffer + if ((y + 3) >= mmax) { + switch ((y + 3) - mmax) { + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + default: + break; + } + } + for (; x > 7; x -= 8) { + asm volatile( + "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ + "dup v31.4s, %w[alpha]\n" /* alpha to vector */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ + "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ + "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ + "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ + "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ + "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ + "b 1f\n" /* to main process */ + "0: \n" /* alpha == 1 */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "1: \n" /* main process */ + "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ + "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ + "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ + "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ + + "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ + "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ + "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ + "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ + + "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ + "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ + "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ + "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ + + "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ + "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ + "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ + "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ + "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ + "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ + "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ + "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "cc", + "memory"); + } + + for (; x > 0; x--) { + if (has_alpha) { + *outptr++ = *inptr0++ * alpha; + *outptr++ = *inptr1++ * alpha; + *outptr++ = *inptr2++ * alpha; + *outptr++ = *inptr3++ * alpha; + } else { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + } + } + } +} void prepackA_trans_8x12(float *outptr, const float *in, @@ -682,6 +880,128 @@ void prepackA_trans_8x12(float *outptr, } } } +void pack_trans_m4(float *outptr, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + auto inptr = in + k0 * ldin + m0; + uint32_t mask_buffer[4] = {0, 1, 2, 3}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 4 * (x_len / 4); + int stride_out = 4 * y_len; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask1 = + vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + float32x4_t valpha = vdupq_n_f32(alpha); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + const float *ptr0 = inptr + y * ldin; + const float *ptr1 = ptr0 + ldin; + const float *ptr2 = ptr1 + ldin; + const float *ptr3 = ptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm 
pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) + : "memory"); + + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + vst1q_f32(outptr_row_col, vr00); + vst1q_f32(outptr_row_col + 4, vr10); + vst1q_f32(outptr_row_col + 8, vr20); + vst1q_f32(outptr_row_col + 12, vr30); + + ptr0 += 4; + ptr1 += 4; + ptr2 += 4; + ptr3 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); + float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); + float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); + float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); + + vst1q_f32(outptr_row_col, vr00_1); + vst1q_f32(outptr_row_col + 4, vr10_1); + vst1q_f32(outptr_row_col + 8, vr20_1); + vst1q_f32(outptr_row_col + 12, vr30_1); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + const float *ptr0 = inptr + y * ldin; + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr0 = vld1q_f32(ptr0); + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + vst1q_f32(outptr_row_col, vr0); + + ptr0 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr0 = vld1q_f32(ptr0); + + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + + float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); + + vst1q_f32(outptr_row_col, vr0_1); + } + } +} #else // __aarch64__ void prepackA_6x8(float* outptr, @@ -2592,6 +2912,292 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto workspace = ctx->workspace_data(); + int threads = ctx->threads(); + + const int n_block = 4; + const int m_block = 4; + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); + x_block /= n_block; + x_block *= n_block; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + n_block - 1) / n_block; + x_block *= n_block; + x_block = x_block < n_block ? n_block : x_block; + + // unroll 2 loop + int tail_pre = (K & (KBLOCK - 1)); + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + bool flag_p_remain = false; + int remain = 0; + + int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; + //! 
apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + n_block - 1) / n_block; + remain = xmax - x0 - (bblocks - 1) * n_block; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + float *b_pannel = workspace; + if (is_transB) { + pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } else { + pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += m_block) { + unsigned int ymax = y + m_block; + if (ymax > M) { + ymax = M; + } + + float bias_local[4] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + } + + float cout0[n_block]; // NOLINT + float cout1[n_block]; // NOLINT + float cout2[n_block]; // NOLINT + float cout3[n_block]; // NOLINT + + float *c_ptr0 = C + y * ldc + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + float *pout0 = c_ptr0; + float *pout1 = c_ptr1; + float *pout2 = c_ptr2; + float *pout3 = c_ptr3; + + const float *a_ptr_l = A_packed + y * K; + const float *b_ptr_l = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 3) >= ymax) { + switch ((y + 3) - ymax) { + case 2: + c_ptr1 = cout1; + case 1: + c_ptr2 = cout2; + case 0: + c_ptr3 = cout3; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + if (has_beta) { + for (int i = 0; i < remain; ++i) { + cout0[i] = pout0[i]; + cout1[i] = pout1[i]; + cout2[i] = pout2[i]; + cout3[i] = pout3[i]; + } + } + } + const float *a_ptr = a_ptr_l; + const float *b_ptr = b_ptr_l + xb * K * 4; + int tail = tail_pre; + int k = k_pre; + // clang-format off + asm volatile( + "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ + "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2, q3*/ + "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ + "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ + "dup v9.4s, v2.s[1]\n" /* out1 = 0*/ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ + "dup v10.4s, v2.s[2]\n" /* out2 = 0*/ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ + "dup v11.4s, v2.s[3]\n" /* out3 = 0*/ + "cbz %w[has_beta], 0f\n" /* check beta == 0? 
*/ + /* process beta */ + "dup v7.4s, %w[beta]\n" /* beta to vector */ + "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ + "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ + "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ + "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ + "ld1 {v2.4s}, [%[c_ptr2]]\n" + "ld1 {v3.4s}, [%[c_ptr3]]\n" + "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ + "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ + + "0: \n" /* check loop count */ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + "cbz %w[k], 2f\n" /* check loop count > 0 */ + /* main loop */ + /* unrool 0*/ + "1:\n" /* main loop */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "bne 1b\n" + "2:\n" /* process tail*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "beq 3f\n" /*jump to tail = 1*/ + /* final unrool 0*/ + /* unrool 0, tail > 1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "beq 4f\n" /*jump to tail = 2*/ + /* unrool 1, tail > 2*/ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + + "beq 5f\n" /*jump to tail = 3*/ + /* unrool 2, tail = 4*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + /* unrool 3, tail = 4*/ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], 
b3 =q7*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "b 11f\n" + /* tails==1 final tail*/ + "3: \n" /* tail=1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "b 11f\n" + /* tails==2 final tail*/ + "4:\n" /* tail = 2*/ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + + "b 11f\n" + /* tails==3 final tail*/ + "5:\n" /* tail = 3*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + "11: \n" /* check if relu */ + "cbz %w[relu], 12f\n" /* skip relu */ + "movi v2.4s, #0\n" /* for relu*/ + "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ + "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ + "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ + "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ + "12: \n" + "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ + "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ + "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ + "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ + + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [k] "+r"(k), + [tail] "+r"(tail), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3) + : [bias_ptr] "r"(bias_local), + [relu] "r"(has_relu), + [has_beta] "r"(has_beta), + [beta] "r"(beta) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + } + } + } + } + } +} #else // __aarch64__ /** * \brief gemm with ablock = 6, bblock = 8, output 6x8 diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc new file mode 100644 index 0000000000000000000000000000000000000000..8087e0337bda0866f5d399a07ecb674f0fa55a3e --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -0,0 +1,1171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
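For orientation, the 4x4 tile produced by the sgemm_prepacked_4x4 kernel above is equivalent to the scalar loop below. This is an illustrative sketch only and not part of the patch; tile_4x4_reference is an invented name, and the packed-operand index formulas are inferred from the fmla indexing in the assembly (a_tile[k*4 + r] == A[y + r][k], b_tile[k*4 + c] == B[k][x + c]).

// Minimal scalar sketch (assumption, not part of the patch) of one 4x4 tile
// computed by sgemm_prepacked_4x4. bias[] is the zero vector when has_bias
// is false, and beta is treated as 0 when has_beta is false.
void tile_4x4_reference(const float* a_tile,  // packed A: a_tile[k*4 + r] == A[y + r][k]
                        const float* b_tile,  // packed B: b_tile[k*4 + c] == B[k][x + c]
                        int K,
                        const float bias[4],
                        float beta,
                        bool has_relu,
                        float* C,
                        int ldc) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      float acc = bias[r] + beta * C[r * ldc + c];
      for (int k = 0; k < K; ++k) {
        acc += a_tile[k * 4 + r] * b_tile[k * 4 + c];
      }
      C[r * ldc + c] = (has_relu && acc < 0.f) ? 0.f : acc;
    }
  }
}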
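The new lite/backends/arm/math/packed_sgemm_c4.cc that begins below operates on channel-blocked ("c4") operands: groups of four M rows (for A and C) or four K values (for B) are stored contiguously per column. As a readable counterpart to the NEON kernels, here is a scalar sketch of the arithmetic they implement; it is not part of the patch, sgemm_c4_reference is an invented name, and the index formulas are inferred from the assembly and loadb_c4.

// Illustrative scalar equivalent (assumption, not in the patch) of
// sgemm_prepack_c4: C = A * B + bias, optional ReLU, with M and K rounded
// up to multiples of 4 and all operands in the c4-blocked layout:
//   A_packed[mb*4*k_round + k*4 + r] == A[mb*4 + r][k]
//   B       [kb*4*N       + n*4 + s] == B[kb*4 + s][n]
//   C       [mb*4*N       + n*4 + r] == C[mb*4 + r][n]
// bias is assumed zero-padded to m_round, as bias_buf is in the kernels.
void sgemm_c4_reference(int m_round, int N, int k_round,
                        const float* A_packed, const float* B, float* C,
                        const float* bias, bool has_bias, bool has_relu) {
  for (int mb = 0; mb < m_round / 4; ++mb) {
    for (int n = 0; n < N; ++n) {
      for (int r = 0; r < 4; ++r) {
        float acc = has_bias ? bias[mb * 4 + r] : 0.f;
        for (int k = 0; k < k_round; ++k) {
          acc += A_packed[mb * 4 * k_round + k * 4 + r] *
                 B[(k / 4) * 4 * N + n * 4 + (k % 4)];
        }
        if (has_relu && acc < 0.f) acc = 0.f;
        C[mb * 4 * N + n * 4 + r] = acc;
      }
    }
  }
}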
+ +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void loadb_c4(float* out, + const float* in, + const int xstart, + const int xend, + const int k_round, + const int n) { + const int xlen = (xend - xstart + NBLOCK_C4 - 1) / NBLOCK_C4 * NBLOCK_C4; + int xloop = xlen / NBLOCK_C4; + const int flag_remain = n < xstart + xlen; + int remain = 0; + int remain4 = 0; + int remain1 = 0; + if (flag_remain) { + remain = (n - xstart) - (xloop - 1) * NBLOCK_C4; + remain4 = remain >> 2; + remain1 = remain & 3; + xloop -= 1; + } + const int ldo = NBLOCK_C4 * k_round; + const int kloop = k_round >> 2; + in += xstart * 4; + if (xloop > 0) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out + 4 * NBLOCK_C4 * i; + const float* in_ptr = in + i * 4 * n; + for (int j = 0; j < xloop; ++j) { + float* out_p = out_ptr + j * ldo; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "ld1 {v4.4s, v5.4s}, [%[in]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + "ld1 {v6.4s, v7.4s}, [%[in]], #32 \n" + "st1 {v4.4s, v5.4s}, [%[out]], #32 \n" + "st1 {v6.4s, v7.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vld1.32 {d8-d11}, [%[in]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + "vld1.32 {d12-d15}, [%[in]]! \n" + "vst1.32 {d8-d11}, [%[out]]! \n" + "vst1.32 {d12-d15}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch674__ + } + } + } + float* out_remain4 = out + xloop * k_round * NBLOCK_C4; + const float* in_remain4 = in + xloop * NBLOCK_C4 * 4; + if (remain4) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain4 + 4 * 4 * i; + const float* in_ptr = in_remain4 + i * 4 * n; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "v0", "v1", "v2", "v3"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "q0", "q1", "q2", "q3"); +#endif // __aarch64__ + } + } + float* out_remain1 = out_remain4 + remain4 * k_round * 4; + const float* in_remain1 = in_remain4 + remain4 * 4 * 4; + if (remain1) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain1 + 4 * remain1 * i; + const float* in_ptr = in_remain1 + i * 4 * n; + for (int j = 0; j < remain1; ++j) { + float32x4_t vin = vld1q_f32(in_ptr); + in_ptr += 4; + vst1q_f32(out_ptr, vin); + out_ptr += 4; + } + } + } +} + +void sgemm_prepack_c4_common(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; + int threads = ctx->threads(); + auto workspace = ctx->workspace_data(); + // l2 = ablock * K * threads + K * bchunk_w + threads * ablock * bchunk_w; + int bchunk_w = (l2_cache - threads * k_round * sizeof(float)) / + ((k_round + threads * MBLOCK_C4) * sizeof(float)); + bchunk_w = bchunk_w > N ? N : bchunk_w; + bchunk_w = bchunk_w / NBLOCK_C4 * NBLOCK_C4; + bchunk_w = bchunk_w > NBLOCK_C4 ? bchunk_w : NBLOCK_C4; + int bchunk_loop = (N + bchunk_w - 1) / bchunk_w; + + const int h_loop = m_round >> 2; // MBLOCK_C4 == 4; + const int kcnt = (k_round + KBLOCK_C4 - 1) / KBLOCK_C4; + const int ldc = N * 4; + const int lda = k_round * 4; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } + // bchunk_loop + float* c = C; + for (int n = 0; n < bchunk_loop; ++n) { + int x_start = n * bchunk_w; + int x_end = x_start + bchunk_w; + int w_loop = bchunk_w / NBLOCK_C4; + int flag_remain = 0; + int w_loop4 = 0; + int remain = 0; + if (x_end > N) { + w_loop = (N - x_start) / NBLOCK_C4; + int w_loop_rem = (N - x_start) - w_loop * NBLOCK_C4; + w_loop4 = w_loop_rem >> 2; + remain = w_loop_rem & 3; + x_end = N; + flag_remain = 1; + } + float* bchunk = workspace; + loadb_c4(bchunk, B, x_start, x_end, k_round, N); + float* cchunk = c + n * bchunk_w * 4; + int has_remain = (n == bchunk_loop - 1) && flag_remain; +#pragma omp parallel for num_threads(threads) + for (int h = 0; h < h_loop; ++h) { + float* bias_h = bias_buf + h * 4; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vbias = vld1q_f32(bias_h); +#endif + const float* ablock = A_packed + h * lda; + const float* bblock = bchunk; + float* cblock = cchunk + h * ldc; + for (int w = 0; w < w_loop; ++w) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "prfm pldl1keep, [%[b], #64] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "mov v13.16b, %[vbias].16b \n" /* mov bias to c4*/ + "mov v14.16b, %[vbias].16b \n" /* mov bias to c5*/ + "mov v15.16b, %[vbias].16b \n" /* mov bias to c6*/ + "mov v16.16b, %[vbias].16b \n" /* mov bias to c7*/ + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load b4b5b6b7 to v25-v28 */ + "ld1 {v25.4s, v26.4s}, [%[b]], #32 \n" + "ld1 {v27.4s, v28.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[a], #32] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b], #64] \n" + "fmla v13.4s, v1.4s, v25.s[0] \n" + "fmla v14.4s, v1.4s, v26.s[0] \n" + "fmla v15.4s, v1.4s, v27.s[0] \n" + "fmla v16.4s, v1.4s, v28.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b], #128] \n" + "fmla v13.4s, v2.4s, v25.s[1] \n" + "fmla v14.4s, v2.4s, v26.s[1] \n" + "fmla v15.4s, v2.4s, v27.s[1] \n" + "fmla 
v16.4s, v2.4s, v28.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + "fmla v13.4s, v3.4s, v25.s[2] \n" + "fmla v14.4s, v3.4s, v26.s[2] \n" + "fmla v15.4s, v3.4s, v27.s[2] \n" + "fmla v16.4s, v3.4s, v28.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + + "fmla v13.4s, v4.4s, v25.s[3] \n" + "fmla v14.4s, v4.4s, v26.s[3] \n" + "fmla v15.4s, v4.4s, v27.s[3] \n" + "fmla v16.4s, v4.4s, v28.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "fmax v16.4s, v16.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + "st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [vbias] "w"(vbias), [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v25", "v26", "v27", "v28", "cc", "memory"); +#else + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + "pld [%[a]] \n" + "pld [%[b]] \n" + "pld [%[b], #64] \n" + "vmov.32 q8, q3 \n" /* mov bias to c0*/ + "vmov.32 q9, q3 \n" /* mov bias to c1*/ + "vmov.32 q10, q3 \n" /* mov bias to c2*/ + "vmov.32 q11, q3 \n" /* mov bias to c3*/ + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmov.32 q12, q3 \n" /* mov bias to c4*/ + "vmov.32 q13, q3 \n" /* mov bias to c5*/ + "vmov.32 q14, q3 \n" /* mov bias to c6*/ + "vmov.32 q15, q3 \n" /* mov bias to c7*/ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vld1.f32 {d8-d11}, [%[b]]! \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + /* c4c5c6c7 */ + "vmla.f32 q12, q0, d8[0] \n" + "vmla.f32 q13, q0, d10[0] \n" + "vmla.f32 q14, q0, d12[0] \n" + "vmla.f32 q15, q0, d14[0] \n" + "pld [%[a], #32] \n" + "vmla.f32 q12, q1, d8[1] \n" + "vmla.f32 q13, q1, d10[1] \n" + "vmla.f32 q14, q1, d12[1] \n" + "vmla.f32 q15, q1, d14[1] \n" + "vld1.32 {d0-d3}, [%[a]]! 
\n" + "vmla.f32 q12, q2, d9[0] \n" + "vmla.f32 q13, q2, d11[0] \n" + "vmla.f32 q14, q2, d13[0] \n" + "vmla.f32 q15, q2, d15[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q12, q3, d9[1] \n" + "vmla.f32 q13, q3, d11[1] \n" + "vmla.f32 q14, q3, d13[1] \n" + "vmla.f32 q15, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); +#endif + // clang-format on + } + if (has_remain) { + if (w_loop4 > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b]] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "cc", "memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d6-d7}, [%[bias]] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" /* load a0 a1 */ + "vmov.32 q8, q3 \n" /* mov bias to c0 */ + "vmov.32 q9, q3 \n" /* mov bias to c1 */ + "vmov.32 q10, q3 \n" /* mov bias to c2 */ + "vmov.32 q11, q3 \n" /* mov bias to c3 */ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! 
\n" + "pld [%[a]] \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "cc", "memory"); +#endif + // clang-format on + } + if (remain > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "cmp %w[remain], #3 \n" + "beq 1f \n" + "cmp %w[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vzero].16b \n" /* mov zero to c1*/ + "3: \n" + "ld1 {v5.4s}, [%[b]], #16 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v2.4s, v5.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v4.4s, v5.s[3] \n" + "bne 3b \n" + "fadd v9.4s, v9.4s, v10.4s \n" + "cbz %w[relu], 6f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "6: \n" + "st1 {v9.4s}, [%[c]], #16 \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vzero].16b \n" /* mov zero to c2*/ + "mov v12.16b, %[vzero].16b \n" /* mov zero to c3*/ + "4: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v2.4s, v5.s[1] \n" + "fmla v12.4s, v2.4s, v6.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v4.4s, v5.s[3] \n" + "fmla v12.4s, v4.4s, v6.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 4b \n" + "fadd v9.4s, v9.4s, v11.4s \n" + "fadd v10.4s, v10.4s, v12.4s \n" + "cbz %w[relu], 7f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "7: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "5: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s}, [%[b]], #16 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, 
v4.4s, v7.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 5b \n" + "cbz %w[relu], 8f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "8: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "st1 {v11.4s}, [%[c]], #16 \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [remain] "r"(remain), [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "cc","memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d0-d1}, [%[bias]] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmov.u32 q15, #0 \n" + "cmp %[remain], #3 \n" + "beq 1f \n" + "cmp %[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "vmov.32 q9, q0 \n" /* mov bias to c0*/ + "vmov.32 q10, q15 \n" /* mov zero to c1*/ + "3: \n" + "vld1.32 {d10-d11}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q2, d10[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q4, d11[1] \n" + "bne 3b \n" + "vadd.f32 q9, q9, q10 \n" + "cmp %[relu], #0 \n" + "beq 6f \n" + "vmax.f32 q9, q9, q15 \n" + "6: \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q15 \n" /* mov zero to c2*/ + "vmov.u32 q12, q15 \n" /* mov zero to c3*/ + "4: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q2, d10[1] \n" + "vmla.f32 q12, q2, d12[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q4, d11[1] \n" + "vmla.f32 q12, q4, d13[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 4b \n" + "vadd.f32 q9, q9, q11 \n" + "vadd.f32 q10, q10, q12 \n" + "cmp %[relu], #0 \n" + "beq 7f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "7: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q0 \n" /* mov bias to c2*/ + "5: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d14-d15}, [%[b]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q1, d14[0] \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q2, d10[1] \n" + "vmla.f32 q10, q2, d12[1] \n" + "vmla.f32 q11, q2, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q3, d15[0] \n" + "pld [%[a]] \n" + "vmla.f32 q9, q4, d11[1] \n" + "vmla.f32 q10, q4, d13[1] \n" + "vmla.f32 q11, q4, d15[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 5b \n" + "cmp %[relu], #0 \n" + "beq 8f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "vmax.f32 q11, q11, q15 \n" + "8: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! 
\n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [remain] "r"(remain) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q9", + "q10", "q11", "q12", "q15", "cc","memory"); +#endif + // clang-format on + } + } + } + } +} + +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + const float* bias_ptr = bias_buf; + for (int m = 0; m < mloop; ++m) { +#ifdef __aarch64__ + float32x4_t vbias = vld1q_f32(bias_ptr); +#endif + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c7*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "mov v12.16b, %[vbias].16b \n" + "mov v13.16b, %[vbias].16b \n" + "mov v14.16b, %[vbias].16b \n" + "mov v15.16b, %[vbias].16b \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + 
"fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c3*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0 */ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vzero].16b \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 
8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c7*/ + "vmov.u32 q8, q3 \n" + "vmov.u32 q9, q3 \n" + "vmov.u32 q10, q3 \n" + "vmov.u32 q11, q3 \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmov.u32 q12, q3 \n" + "vmov.u32 q13, q3 \n" + "vmov.u32 q14, q3 \n" + "vmov.u32 q15, q3 \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c3*/ + "vmov.u32 q8, q12 \n" + "vmov.u32 q9, q12 \n" + "vmov.u32 q10, q12 \n" + "vmov.u32 q11, q12 \n" + "vmov.u32 q13, #0 \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q8, q8, q13 \n" + "vmax.f32 q9, q9, q13 \n" + "vmax.f32 q10, q10, q13 \n" + "vmax.f32 q11, q11, q13 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d14-d15}, [%[bias]] \n" + "vmov.u32 q8, #0 \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* mov bias to c0 */ + "vmov.u32 q5, q7 \n" + "vmov.u32 q6, q8 \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 1b \n" + "vadd.f32 q5, q5, q6 \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q5, q5, q8 \n" + "2:\n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + bias_ptr += 4; + A_packed += lda; + } +} + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + if (N > 16) { + sgemm_prepack_c4_common( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } else { + sgemm_prepack_c4_small( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h new file mode 100644 index 0000000000000000000000000000000000000000..21e5af634315a7da66914bb04775088fec55550c --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +constexpr int MBLOCK_C4 = 4; +constexpr int NBLOCK_C4 = 8; +constexpr int KBLOCK_C4 = 4; + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index a857e9830c54b568c93afa4c1aa119ed2baffa1e..8524d7376f2bb7e337dfc11b890c00e281d2e880 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -46,7 +46,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -125,18 +125,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? 
(hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 506451932dcccd98368a050484a38bc8a922eb22..1830423136cc883d30d4eecad0eb9fcfc9ded6ba 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/sgemv.h" #include +#include #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +51,495 @@ void sgemv_bias_relu(const bool transA, const float *x, float *y, const float *bias); +#ifdef __aarch64__ +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt16 = M >> 4; + int m_cnt8 = (M & 15) >> 3; + int m_cnt4 = (M & 15 & 7) >> 2; + int m_remain = M & 15 & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt16 > 0) { + int cnt16 = m_cnt16; + asm volatile( + "ld1 {v4.4s}, [%[x]] \n" /* load x to v4 */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "1:\n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]] \n" /*load y to v0, v1, v2, v3 */ + "fmla v0.4s, v5.4s, v4.s[0] \n" /* v0 += v5 * v4[0] */ + "fmla v1.4s, v6.4s, v4.s[0] \n" /* v1 += v6 * v4[0] */ + "fmla v2.4s, v7.4s, v4.s[0] \n" /* v2 += v7 * v4[0] */ + "fmla v3.4s, v8.4s, v4.s[0] \n" /* v3 += v8 * v4[0] */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "fmla v0.4s, v9.4s, v4.s[1] \n" /* v0 += v9 * v4[1] */ + "fmla v1.4s, v10.4s, v4.s[1] \n" /* v1 += v10 * v4[1] */ + "fmla v2.4s, v11.4s, v4.s[1] \n" /* v2 += v11 * v4[1] */ + "fmla v3.4s, v12.4s, v4.s[1] \n" /* v3 += v12 * v4[1] */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 
*/ + "fmla v0.4s, v13.4s, v4.s[2] \n" /* v0 += v13 * v4[2] */ + "fmla v1.4s, v14.4s, v4.s[2] \n" /* v1 += v14 * v4[2] */ + "fmla v2.4s, v15.4s, v4.s[2] \n" /* v2 += v15 * v4[2] */ + "fmla v3.4s, v16.4s, v4.s[2] \n" /* v3 += v16 * v4[2] */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "fmla v0.4s, v17.4s, v4.s[3] \n" /* v0 += v17 * v4[3] */ + "fmla v1.4s, v18.4s, v4.s[3] \n" /* v1 += v18 * v4[3] */ + "fmla v2.4s, v19.4s, v4.s[3] \n" /* v2 += v19 * v4[3] */ + "fmla v3.4s, v20.4s, v4.s[3] \n" /* v3 += v20 * v4[3] */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]], #64 \n" /* store v0, v1, v2, v3 to y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #64 \n" /* restore in0 address */ + "sub %[in1], %[in1], #64 \n" /* restore in1 address */ + "sub %[in2], %[in2], #64 \n" /* restore in2 address */ + "sub %[in3], %[in3], #64 \n" /* restore in3 address */ + : [cnt] "+r"(cnt16), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "cc", "memory" + ); + } + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "ld1 {v2.4s}, [%[x]] \n" /* load x to v2 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in1 to v5, v6 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in2 to v7, v8 */ + "ld1 {v9.4s, v10.4s}, [%[in3]], #32 \n" /* load in3 to v9, v10*/ + "1:\n" + "ld1 {v0.4s, v1.4s}, [%[y]] \n" /* load y to v0, v1 */ + "fmla v0.4s, v3.4s, v2.s[0] \n" /* v0 += v3 * v2[0] */ + "fmla v1.4s, v4.4s, v2.s[0] \n" /* v1 += v4 * v2[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "fmla v0.4s, v5.4s, v2.s[1] \n" /* v0 += v5 * v2[1] */ + "fmla v1.4s, v6.4s, v2.s[1] \n" /* v1 += v6 * v2[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in0 to v5, v6 */ + "fmla v0.4s, v7.4s, v2.s[2] \n" /* v0 += v7 * v2[2] */ + "fmla v1.4s, v8.4s, v2.s[2] \n" /* v1 += v8 * v2[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in0 to v7, v8 */ + "fmla v0.4s, v9.4s, v2.s[3] \n" /* v0 += v9 * v2[3] */ + "fmla v1.4s, v10.4s, v2.s[3] \n" /* v1 += v10 * v2[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s, v1.4s}, [%[y]], #32 \n" /* store v0, v1 to y */ + "ld1 {v9.4s, v10.4s},[%[in3]], #32 \n" /* load in0 to v9, v10*/ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "ld1 {v3.4s}, 
[%[in2]], #16 \n" /* load in2 to v3 */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "ld1 {v5.4s}, [%[x]] \n" /* load x to v5 */ + "1:\n" + "ld1 {v0.4s}, [%[y]] \n" /* load y to v0 */ + "fmla v0.4s, v1.4s, v5.s[0] \n" /* v0 += v1 * v5[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "fmla v0.4s, v2.4s, v5.s[1] \n" /* v0 += v2 * v5[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "fmla v0.4s, v3.4s, v5.s[2] \n" /* v0 += v3 * v5[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "fmla v0.4s, v4.4s, v5.s[3] \n" /* v0 += v4 * v5[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s}, [%[y]], #16 \n" /* store v0 to y */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + int cnt4 = M >> 2; + int remain = M & 3; + //! do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += cnt4 * 4; + y_ptr += cnt4 * 4; + for (int j = 0; j < remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#else +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt8 = M >> 3; + int m_cnt4 = (M & 7) >> 2; + int m_remain = M & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d4-d5}, [%[x]] \n" /* load x to q2 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in1 to q5, q6 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in2 to q7, q8 */ + "vld1.32 {d18-d21},[%[in3]]! \n" /* load in3 to q9, q10*/ + "1:\n" + "vld1.32 {d0-d3}, [%[y]] \n" /* load y to q0, q1 */ + "vmla.f32 q0, q3, d4[0] \n" /* q0 += q3 * q2[0] */ + "vmla.f32 q1, q4, d4[0] \n" /* q1 += q4 * q2[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vmla.f32 q0, q5, d4[1] \n" /* q0 += q5 * q2[1] */ + "vmla.f32 q1, q6, d4[1] \n" /* q1 += q6 * q2[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in0 to q5, q6 */ + "vmla.f32 q0, q7, d5[0] \n" /* q0 += q7 * q2[2] */ + "vmla.f32 q1, q8, d5[0] \n" /* q1 += q8 * q2[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in0 to q7, q8 */ + "vmla.f32 q0, q9, d5[1] \n" /* q0 += q9 * q2[3] */ + "vmla.f32 q1, q10, d5[1] \n" /* q1 += q10 * q2[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d3}, [%[y]]! \n" /* store q0, q1 to y */ + "vld1.32 {d18-d21},[%[in3]]! 
\n" /* load in0 to q9, q10*/ + "pld [%[y], #32] \n" /* preload y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in1 to q2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in2 to q3 */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in3 to q4 */ + "vld1.32 {d10-d11},[%[x]] \n" /* load x to q5 */ + "1:\n" + "vld1.32 {d0-d1}, [%[y]] \n" /* load y to q0 */ + "vmla.f32 q0, q1, d10[0] \n" /* q0 += q1 * q5[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vmla.f32 q0, q2, d10[1] \n" /* q0 += q2 * q5[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in0 to q2 */ + "vmla.f32 q0, q3, d11[0] \n" /* q0 += q3 * q5[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in0 to q3 */ + "vmla.f32 q0, q4, d11[1] \n" /* q0 += q4 * q5[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d1}, [%[y]]! \n" /* store q0 to y */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in0 to q4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + //! 
do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < m_cnt8; ++j) { + float32x4_t val00 = vld1q_f32(y0_ptr + j * 8); + float32x4_t val01 = vld1q_f32(y0_ptr + j * 8 + 4); + float32x4_t val10 = vld1q_f32(y_ptr + j * 8); + float32x4_t val11 = vld1q_f32(y_ptr + j * 8 + 4); + float32x4_t val0 = vaddq_f32(val00, val10); + float32x4_t val1 = vaddq_f32(val01, val11); + vst1q_f32(y0_ptr + j * 8, val0); + vst1q_f32(y0_ptr + j * 8 + 4, val1); + } + y0_ptr += m_cnt8 * 8; + y_ptr += m_cnt8 * 8; + for (int j = 0; j < m_cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += m_cnt4 * 4; + y_ptr += m_cnt4 * 4; + for (int j = 0; j < m_remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d0-d3}, [%[in_y]]! \n" /* load y to q0, q1 */ + "1:\n" + "vmax.f32 q2, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vmax.f32 q3, q1, %q[vzero] \n" /* q1 relu */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d4-d7}, [%[out_y]]! \n" /* store q0, q1 to y*/ + "vld1.32 {d2-d3}, [%[in_y]]! \n" /* load y to q0 */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #32 \n" /* restore in_y */ + : [cnt] "+r"(cnt8), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "cc", "memory"); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#endif // __aarch64__ bool sgemv(const float *A, const float *x, @@ -59,33 +549,34 @@ bool sgemv(const float *A, int N, bool is_bias, const float *bias, - bool is_relu) { + bool is_relu, + const ARMContext *ctx) { if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } + sgemv_trans(M, N, A, x, y, is_bias, bias, is_relu, ctx); } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); + if (is_bias) { + //! with bias + if (is_relu) { + //! with relu + sgemv_bias_relu(transA, M, N, A, x, y, bias); + } else { + //! without relu + sgemv_bias(transA, M, N, A, x, y, bias); + } } else { - //! without relu - sgemv(transA, M, N, A, x, y); + //! without bias + if (is_relu) { + //! with relu + sgemv_relu(transA, M, N, A, x, y); + } else { + //! 
without relu + sgemv(transA, M, N, A, x, y); + } } } return true; } - +// clang-format off //! define compute kernel #ifdef __aarch64__ #define SGEMV_IN_8 \ @@ -179,8 +670,8 @@ bool sgemv(const float *A, "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ "faddp s8, v16.2s \n" /* pair add to scale */ \ @@ -231,8 +722,8 @@ bool sgemv(const float *A, "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ @@ -283,7 +774,7 @@ bool sgemv(const float *A, "fmax s8, s8, s0 \n" /* relu */ \ "str s8, [%[out]] \n" /* save result */ -#else //__aarch64__ +#else // __aarch64__ #define SGEMV_IN_4 \ "pld [%[in]] @ preload cache line, input\n" \ @@ -349,8 +840,8 @@ bool sgemv(const float *A, "vmla.f32 q1, q5, q9 @ mul add\n" \ "vmla.f32 q2, q5, q11 @ mul add\n" \ "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ + "bne 1b @ jump to main loop\n" \ + /* pair add to final result */ \ "2: @ pair add \n" \ "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ @@ -382,13 +873,10 @@ bool sgemv(const float *A, "vmla.f32 q0, q12, q14 @ mul add\n" \ "vmla.f32 q0, q13, q15 @ mul add\n" \ "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ + "bne 1b @ jump to main loop\n" \ "2: @ end processing\n" \ "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ + "vpadd.f32 d0, d2, d2 @ pair add, final step\n"/*check tails*/ \ "cmp %[tail], #1 @ check whether has mid cols\n" \ "blt 4f @ jump to end\n" \ "3: @ tail loop\n" \ @@ -422,7 +910,7 @@ bool sgemv(const float *A, "vmax.f32 d0, d0, d1 @ relu\n" \ "vst1.32 {d0[0]}, [%[out]] @ save result\n" #endif - +// clang-format on void sgemv(const bool transA, const int M, const int N, @@ -523,7 +1011,7 @@ void sgemv(const bool transA, [tmp4] "r"(tmp4) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -579,7 +1067,7 @@ void sgemv(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_relu(const bool transA, @@ -671,7 +1159,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -727,7 +1215,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias(const bool 
transA, @@ -822,7 +1310,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -887,7 +1375,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias_relu(const bool transA, @@ -980,7 +1468,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -1045,7 +1533,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } } // namespace math diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h index 4d74006f9320ee770bc4f57a52a58df3bce4db9e..aa17349c99e61f7135090318be829149ecd6bb57 100644 --- a/lite/backends/arm/math/sgemv.h +++ b/lite/backends/arm/math/sgemv.h @@ -15,6 +15,8 @@ #pragma once #include +#include "lite/core/context.h" +#include "lite/core/device_info.h" namespace paddle { namespace lite { @@ -28,9 +30,10 @@ bool sgemv(const float* A, bool transA, int M, int N, - bool is_bias = false, - const float* bias = nullptr, - bool is_relu = false); + bool is_bias, + const float* bias, + bool is_relu, + const ARMContext* ctx); } // namespace math } // namespace arm diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index a6c3fcc66a789f159cd3a756ed893627b393e1fe..f73b4120e6a48bfdec04d0706a47bcc4a54fcf5e 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -1,8 +1,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 13bf8190efe1592e7509039a569d31f6bddc5b66..9da70262f5b2e32ae8509d9370142b2499886bfb 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -56,6 +56,15 @@ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ } +const int CUDA_NUM_THREADS = 512; +// CUDA: number of blocks for threads. 
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + namespace paddle { namespace lite { namespace cuda { diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index a5ee25643b4c87c9488df5b2acaead26773855a9..fafd74ae7a43d1a769456edfe408c71593d21201 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,8 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) @@ -12,6 +11,9 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) +nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) set ( math_cuda @@ -21,6 +23,9 @@ set ( cuda_type_trans cuda_transpose cuda_elementwise + cudnn_pool + cuda_gemm + cuda_batched_gemm ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/batched_gemm.cc b/lite/backends/cuda/math/batched_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e81510927615daa88e7f5bef3ce7b8421d8f6539 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/batched_gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool BatchedGemm::init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + cudaMalloc(reinterpret_cast(&A_), + 3 * max_batch_size * sizeof(float *)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const float *b[], + float *c[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size, + b, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size * 2, + c, + batch_size * sizeof(float *), + cudaMemcpyHostToDevice, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + 3 * batch_size * sizeof(const float *), + cudaMemcpyDefault, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/batched_gemm.h b/lite/backends/cuda/math/batched_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..2b91d3a524596bf03b4a26a81c14eddcfe64452f --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
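The wrapper implemented above maps batched row-major GEMMs onto column-major cuBLAS by packing the host-side A/B/C pointer arrays into one device buffer and calling cublasSgemmBatched with the operands swapped and m/n exchanged. A minimal usage sketch, assuming the float/float specialization shown above; `CtxT` stands in for the CUDA context type whose template arguments are elided in this diff, and `d_a`/`d_b`/`d_c` are hypothetical caller-owned arrays of device pointers:

#include "lite/backends/cuda/math/batched_gemm.h"

// Computes C[i] = A[i] * B[i] for `batch` independent row-major float GEMMs.
template <typename CtxT>
void batched_gemm_example(CtxT* ctx, const float* d_a[], const float* d_b[],
                          float* d_c[], int m, int n, int k, int batch) {
  paddle::lite::cuda::math::BatchedGemm<float, float> bgemm;
  // One-off: create the cuBLAS handle on the context's exec stream and
  // allocate a device buffer holding 3 * max_batch_size pointer slots.
  bgemm.init(/*trans_a=*/false, /*trans_b=*/false, /*max_batch_size=*/batch, ctx);
  // Per call: async-copy the pointer arrays into that buffer, then issue
  // cublasSgemmBatched on the swapped operands.
  bgemm.run(1.f, 0.f, d_a, d_b, d_c, m, n, k, batch);
}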
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class BatchedGemm { + public: + BatchedGemm() : cu_handle_(nullptr) {} + ~BatchedGemm() { + if (A_ != nullptr) { + cudaFree(A_); + } + } + + bool init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const PtypeIn* b[], + PtypeOut* c[], + const int m, + const int n, + const int k, + const int batch_size); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const int m, + const int n, + const int k, + const int batch_size); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + PtypeIn** A_{nullptr}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 72ed3951f6b9b22a5ae1ee6caef8c69708102885..a4f33f467feb8626696595e95a29fde7b636919d 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -31,6 +31,9 @@ bool CudnnConv2D::create(const operators::ConvParam& param, auto o_dims = param.output->dims(); int batch = x_dims[0]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int iw = x_dims[3]; // nchw int ih = x_dims[2]; int ic = x_dims[1]; @@ -41,10 +44,10 @@ bool CudnnConv2D::create(const operators::ConvParam& param, int kh = w_dims[2]; int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; CHECK(ic % param.groups == 0) << "The conv input channel shoud be divide group number."; @@ -133,8 +136,8 @@ bool CudnnConv2D::create(const operators::ConvParam& param, this->fwd_algo_ = algo_cache.GetAlgorithm(x_dims.Vectorize(), w_dims.Vectorize(), param.strides, - param.paddings, - param.dilations, + *param.paddings, + *param.dilations, 0, search_func); @@ -311,12 +314,15 @@ bool CudnnConv2DInt8::create(const operators::ConvParam& param, int kw = w_dims[2]; int kh = w_dims[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; std::vector weight_scale = param.weight_scale; float input_scale = param.input_scale; diff --git a/lite/backends/cuda/math/cudnn_pool.cc b/lite/backends/cuda/math/cudnn_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..f970fc326b29c4c226e7dc9643e416a3cf24f0eb --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/cudnn_pool.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/backends/cuda/math/scale.h" +#include "lite/backends/cuda/math/type_trans.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::vector& data_dims, + const std::vector& strides, + const std::vector& ksize) { + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + CHECK(data_dims.size() * 2 == paddings->size()) + << "Paddings size should be the same or twice as the pooling size."; + } + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const std::vector& data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} + +template <> +bool CudnnPool2DNHWC::create( + const operators::PoolParam& param, Context* ctx) { + return true; +} + +template <> +bool CudnnPool2DNHWC::init(const operators::PoolParam& param, + Context* ctx) { + this->stream_ = ctx->exec_stream(); + CUDNN_CHECK(cudnnCreate(&this->handle_)); + CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); + + cudnnCreateTensorDescriptor(&this->input_desc_); + cudnnCreateTensorDescriptor(&this->output_desc_); + cudnnCreatePoolingDescriptor(&this->pooling_desc_); + + return create(param, ctx); +} + +template <> +bool CudnnPool2DNHWC::run( + const operators::PoolParam& param) { + auto x_dims = param.x->dims(); + auto o_dims = param.output->dims(); + int batch = x_dims[0]; + const float* in_data = param.x->data(); + float* out_data = param.output->mutable_data(TARGET(kCUDA)); + + int ih = x_dims[1]; + int iw = x_dims[2]; // nchw + int ic = x_dims[3]; + + int oh = o_dims[1]; + int ow = o_dims[2]; + int oc = o_dims[3]; + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = *(param.paddings.get()); + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + + std::vector data_dims = {ih, iw}; + UpdatePadding(&paddings, global_pooling, adaptive, data_dims, strides, ksize); + + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + ic, + ih, + iw)); + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, + CUDNN_TENSOR_NHWC, 
+ CUDNN_DATA_FLOAT, + batch, + oc, + oh, + ow)); + cudnnPoolingMode_t mode; + if (pooling_type == "max") { + mode = CUDNN_POOLING_MAX; + } else { + mode = exclusive ? CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CUDNN_CHECK(cudnnSetPoolingNdDescriptor(this->pooling_desc_, + mode, + CUDNN_NOT_PROPAGATE_NAN, + ksize.size(), + ksize.data(), + paddings.data(), + strides.data())); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnPoolingForward(this->handle_, + this->pooling_desc_, + &alpha, + this->input_desc_, + in_data, + &beta, + this->output_desc_, + out_data)); + + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_pool.h b/lite/backends/cuda/math/cudnn_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..acdc695b500ab41d615cb98c9501efd729c2fe6a --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CudnnPool2DBase { + public: + CudnnPool2DBase() + : handle_(NULL), + input_desc_(NULL), + output_desc_(NULL), + pooling_desc_(NULL) {} + + ~CudnnPool2DBase() { + if (handle_ != NULL) { + CUDNN_CHECK(cudnnDestroy(handle_)); + } + if (input_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + } + if (output_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_)); + } + if (pooling_desc_) { + cudnnDestroyPoolingDescriptor(pooling_desc_); + } + } + + protected: + cudaStream_t stream_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t input_desc_; + cudnnTensorDescriptor_t output_desc_; + cudnnPoolingDescriptor_t pooling_desc_; +}; + +template +class CudnnPool2DNHWC : public CudnnPool2DBase { + public: + CudnnPool2DNHWC() : CudnnPool2DBase() {} + virtual ~CudnnPool2DNHWC() = default; + virtual bool init(const operators::PoolParam& param, + Context* ctx); + + virtual bool create(const operators::PoolParam& param, + Context* ctx); + + virtual bool run(const operators::PoolParam& param); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 57c9ec022a6e49551fd2d56a9b2036de13bf5a2c..8f0ebd1f97a03f03b568de694b986e9540f07c55 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,13 +13,55 @@ // limitations under the License. 
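The pooling wrapper above splits its work between init(), which creates the cuDNN handle and the tensor/pooling descriptors once, and run(), which normalizes paddings and kernel size (UpdatePadding/UpdateKsize), rebuilds the NHWC descriptors from the current PoolParam, and calls cudnnPoolingForward. A minimal sketch of how a pool kernel might drive it; `PoolT` and `CtxT` stand in for the CudnnPool2DNHWC specialization and context types whose template arguments are elided in this diff, and all other names are illustrative:

#include "lite/backends/cuda/math/cudnn_pool.h"

template <typename PoolT, typename CtxT>
void pool_forward_example(PoolT* pool,
                          const paddle::lite::operators::PoolParam& param,
                          CtxT* ctx) {
  // One-off setup: cuDNN handle bound to the exec stream plus descriptors.
  pool->init(param, ctx);
  // Per invocation: max or average mode is chosen from param.pooling_type
  // and param.exclusive, then cudnnPoolingForward writes param.output.
  pool->run(param);
}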
#include "lite/backends/cuda/math/elementwise.h" -#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +__global__ void elementwise_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; +#if __CUDA_ARCH__ >= 350 + out_data[tid] = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); +#else + out_data[tid] = binary_calc(x_data[tid], y_data[idx], type); +#endif + } +} + +template +__global__ void elementwise_relu_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : 0; + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -76,6 +118,56 @@ __global__ void elementwise_add_nhwc4_int8_kernel(const size_t total, } } +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template void elementwise(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + +template void elementwise_relu(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index 7fcdf95021ff21379bf94298ed06328dd6d2db09..ce45d0544e5a55a9cdc34bdfacc2b48157f5a198 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,12 +15,33 @@ #pragma once #include #include +#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/gemm.cc b/lite/backends/cuda/math/gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9f12984aa5cddfc0acb24de1ebd66735c5d498e --- /dev/null +++ b/lite/backends/cuda/math/gemm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + lda_ = (!trans_a) ? k : m; + ldb_ = (!trans_b) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + m_ = m; + n_ = n; + k_ = k; + lda_ = lda; + ldb_ = ldb; + ldc_ = ldc; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::run(const float alpha, + const float beta, + const float *a, + const float *b, + float *c, + Context *ctx) { + CUBLAS_CALL(cublasSgemm(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b, + ldb_, + a, + lda_, + &beta, + c, + ldc_)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..12194d54b08a533a3812e10b5d2f78134c19da24 --- /dev/null +++ b/lite/backends/cuda/math/gemm.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
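Like the batched variant, the Gemm wrapper implemented above handles a single row-major C = alpha * A * B + beta * C by swapping the operands and exchanging m and n in the cublasSgemm call. A minimal usage sketch, assuming the float/float specialization shown above; `CtxT` stands in for the CUDA context type whose template arguments are elided in this diff, and `d_a`/`d_b`/`d_c` are hypothetical device buffers:

#include "lite/backends/cuda/math/gemm.h"

template <typename CtxT>
void gemm_example(CtxT* ctx, const float* d_a, const float* d_b, float* d_c,
                  int m, int n, int k) {
  paddle::lite::cuda::math::Gemm<float, float> gemm;
  // This init() overload derives lda/ldb/ldc from m/n/k and the transpose
  // flags; the second overload accepts explicit leading dimensions.
  gemm.init(/*trans_a=*/false, /*trans_b=*/false, m, n, k, ctx);
  gemm.run(1.f, 0.f, d_a, d_b, d_c, ctx);
}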
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class Gemm { + public: + Gemm() : cu_handle_(nullptr) {} + ~Gemm() {} + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + Context* ctx); + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a, + const PtypeIn* b, + PtypeOut* c, + Context* ctx); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b4cd82fd8df6df063d92df709311f3c90e7cf4b6..b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -25,6 +25,24 @@ namespace lite { namespace cuda { namespace math { +enum class BinaryOperation { + kADD = 0, + kMUL = 1, + kDIV = 2, +}; + +template +__device__ T binary_calc(T x, T y, BinaryOperation type); + +template <> +__device__ __forceinline__ float binary_calc(float x, + float y, + BinaryOperation type) { + if (type == BinaryOperation::kADD) return x + y; + if (type == BinaryOperation::kMUL) return x * y; + if (type == BinaryOperation::kDIV) return x / y; +} + template __device__ T from_float(float x); diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index fd17218d06f050df3dc935bdde0a320e52b56a40..23332b422df65250f8cadf07f5e0d95e970d316a 100644 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -294,10 +294,17 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.channels = input->shape().channel(); args.image.width = input->shape().width(); args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; args.output.address = out_address; args.output.scale_address = out_scale_address; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATAL) << "This pad is not supported: 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } @@ -372,10 +379,18 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; + args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100755 new mode 100644 index 9d7b9b544bff953662bab86f095823c5c7b3075b..f86806102d4a217ae4bb7355b36ca10d96ca4a05 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -61,14 +61,21 @@ class DepthwiseConvPE : public PE { args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = param.paddings[2]; + args.image.pad_height = param.paddings[0]; args.image.scale_address = input->scale(); args.output.address = output->data(); args.output.scale_address = output->scale(); args.out_width = param.output->shape().width(); args.out_height = param.output->shape().height(); args.sub_conv_num = 1; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.args = args; inplace_.relu_enable = param_.relu.enabled; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index fd3be1f463d3bfce925cc4ce5444d119c33e5692..5bb4f5285a48c7696b1f0f78a9b1c4fe6a9d76c5 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -45,13 +45,14 @@ class PoolingPE : public PE { PoolingArgs args = {0}; args.mode = param_.type; + auto paddings = *param_.paddings; args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.image.address = input->data(); args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; + args.image.pad_height = paddings[0]; + args.image.pad_width = paddings[2]; args.image.scale_address = input->scale(); args.output.address = output->mutableData(); args.output.scale_address = output->scale(); @@ -76,12 +77,13 @@ class PoolingPE : public PE { float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); + auto paddings = *param_.paddings; int image_height = input->shape().height(); int image_width = input->shape().width(); int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; + int image_pad_h = paddings[0]; + int image_pad_w = paddings[2]; int kernel_height = param_.kernelSize[1]; int kernel_width = param_.kernelSize[0]; int kernel_step_h = param_.strides[0]; diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc index ad5bed5be91298744abc0675bf12adb117afb60b..954fad8c916e152c5de06ce285b4ac17ecf22a01 100644 --- a/lite/backends/npu/builder.cc +++ b/lite/backends/npu/builder.cc @@ -142,21 +142,25 @@ ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, int CvtActMode(std::string act_type) { int act_mode = 1; - if (act_type == "sigmod") { + if (act_type == "sigmoid") { act_mode = 0; } else if (act_type == "relu") { act_mode = 1; } else if (act_type == "tanh") { act_mode = 2; + } else if (act_type == "relu_clipped") { + act_mode = 3; } else if (act_type == "elu") { act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; } else if (act_type == "abs") { act_mode = 6; } else if (act_type == "softsign") { act_mode = 8; } else if (act_type == "softplus") { act_mode = 9; - } else if (act_type == "hardsigmoid") { + } else if (act_type == "hard_sigmoid") { act_mode = 10; } else { // TODO(hong19860320) support more activation mode diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h index 02f7071a4e1c5436cce4b4956aa5529fd74be282..70200354fbab15f043a537300e92e2a26a3d739e 100644 --- a/lite/backends/npu/builder.h +++ b/lite/backends/npu/builder.h @@ -31,117 +31,6 @@ // Extended Ops of HIAI DDK namespace ge { -/** - * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. - * The inputs must be two-dimensional matrices and the inner dimension of "x1" - * (after being transposed if transpose_x1 is true) must match the outer - * dimension of "x2" (after being transposed if transposed_x2 is true). - * x : the first input tensor, must be non const op. - * w : the second input tensor, must be const op. - * bias: the optional bias tensor, must be const op. - * - * y : the output tensor. 
- * - * has_bias: If true, enable input bias. - */ -REG_OP(MatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(w, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true - .OP_END(); - -/** - * Computes the gradients of convolution with respect to the input. - * - * input_sizes : An integer vector representing the shape of input, - * where input is a 4-D [batch, height, width, channels] tensor. - * filter : the filter tensor, with shape [H , W, filter_channel, - * filter_number], filter_channel must be same as x channel. - * x : The input tensor. - * - * y : The output tensor. - * - * format: 0: NCHW. 1: NHWC - * group : 1: default - * num_output : 0: default, num_output must be equal to - * (filter_channel * group) - * pad : Padding for the beginning and ending along each axis - * stride : Stride along each axis. - * dilation : dilation value along each axis of the filter. - * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET - * bias_term : 0: default - * kernel : The shape of the convolution kernel - */ -REG_OP(Deconvolution) - .INPUT(input_sizes, TensorType({DT_UINT8})) - .INPUT(filter, TensorType({DT_FLOAT})) - .INPUT(x, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(mode, AttrValue::INT{1}) - .ATTR(format, AttrValue::INT{1}) - .ATTR(group, AttrValue::INT{1}) - .ATTR(num_output, AttrValue::INT{0}) - .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) - .ATTR(stride, AttrValue::LIST_INT({1, 1})) - .ATTR(dilation, AttrValue::LIST_INT({1, 1})) - .ATTR(pad_mode, AttrValue::INT{0}) - .ATTR(bias_term, AttrValue::INT{0}) - .ATTR(kernel, AttrValue::LIST_INT({0, 0})) - .OP_END(); - -/** - * Resize images to size using bilinear interpolation. - * - * x : The tensor of 4-D - * w : A int32 Tensor of 2 elements: [height, width]. - * - * y : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. - * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: - * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is - * controled by the [height, width] of w. - * shrink_factor : shrink factor. - * zoom_factor : zoom factor. - * pad_begin : begin of pad. - * pad_end : end of pad. - */ -REG_OP(ResizeBilinear) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .ATTR(output_dim_mode, AttrValue::INT{2}) - .ATTR(shrink_factor, AttrValue::INT{1}) - .ATTR(zoom_factor, AttrValue::INT{1}) - .ATTR(pad_begin, AttrValue::INT{0}) - .ATTR(pad_end, AttrValue::INT{0}) - .OP_END(); - -/** - * Resize images to size using nearest neighbor interpolation. - * - * image : Resize images to size using nearest neighbor interpolation. - * size : Must be one dimension and two elements - * - * output : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. 
Defaults to false - */ -REG_OP(ResizeNearestNeighbor) - .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .OP_END(); - /** * Pads a tensor. * diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 357ac8c2d6ae340743fa713641e3e89449f1489f..93e176f9ed102f0675c987e57ddde6088158ec97 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() { do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(ERROR) << "Cannot find the " << #cl_func \ + LOG(FATAL) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ break; \ } \ diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 2dea4364d5ee2d11d6d266935fad2a1180954369..a89107632341cf063ac3166aa9890ff383e3383f 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -50,7 +50,8 @@ math_library(unpooling) math_library(vol2col) ## math_library(prelu) math_library(tree2col DEPS math_function) - +math_library(sequence_topk_avg_pooling) +math_library(search_fc DEPS blas dynload_mklml) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index bbe35b4de5508c70496e5c8566c8d1b982a7155c..8d61fb3bbb97705c697fba934e6cab9424f85bad 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/beam_search.h" #include +#include #include #include "lite/fluid/lod.h" diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index 9da239f9c63371350403cc0bd0eecc94eab87590..ab6c1edb481f914d5751149aca2595fee550ca51 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -49,7 +49,7 @@ class Pool2dFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -130,7 +130,7 @@ class Pool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -213,7 +213,7 @@ class MaxPool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc new file mode 100644 index 0000000000000000000000000000000000000000..56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021 --- /dev/null +++ b/lite/backends/x86/math/search_fc.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/search_fc.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +class SearchFcFunctor { + public: + void operator()(const lite::X86Context& context, + const lite::Tensor& bottom, + const lite::Tensor& w, + const lite::Tensor& b, + lite::Tensor* top, + int out_size) { + int batch = bottom.dims()[0]; + + int _out = w.dims()[0]; // 100 + int _in = w.dims()[1]; // 228 + + lite::DDim dims(std::vector({bottom.dims()[0], out_size})); + + const auto bottom_data = bottom.data(); + auto top_data = top->mutable_data(lite::TargetType::kX86); + const auto weights = w.data(); + auto blas = math::GetBlas(context); + call_gemm(blas, + CblasNoTrans, + CblasTrans, + batch, + _out, + _in, + 1.0f, + bottom_data, + weights, + 0.0f, + top_data); + if (true) { + const auto* bias_data = b.data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } + + // private: +}; + +#define DEFINE_FUNCTOR(type) \ + template class SearchFcFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/search_fc.h b/lite/backends/x86/math/search_fc.h new file mode 100644 index 0000000000000000000000000000000000000000..e415c396023dbc10358992012197f4cfebac554f --- /dev/null +++ b/lite/backends/x86/math/search_fc.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/backends/x86/mklml.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void call_gemm(const BlasT blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), + (TransB == CblasTrans), + M, + N, + K, + alpha, + A, + B, + beta, + C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + z + jjj, + _mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + 
y[jjj]; + } +} + +template +class SearchFcFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& X, + const lite::Tensor& W, + const lite::Tensor& b, + lite::Tensor* Out, + int out_size); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..035a7923c70f91cf27f1d845f68110f8f33cb73d --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
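get_topk_pos above collects the indices of the k largest values by repeated linear scans and pads with -1 when the row has fewer than k elements. A standalone equivalent for illustration (using std::partial_sort over indices instead of the scan; tie-breaking may differ):

#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

// Indices of the k largest values in descending order, padded with -1.
std::vector<int> topk_pos(const std::vector<float>& v, int k) {
  std::vector<int> idx(v.size());
  std::iota(idx.begin(), idx.end(), 0);
  const int real_k = std::min<int>(k, static_cast<int>(idx.size()));
  std::partial_sort(idx.begin(), idx.begin() + real_k, idx.end(),
                    [&](int a, int b) { return v[a] > v[b]; });
  idx.resize(real_k);
  idx.resize(k, -1);  // mirror the -1 padding in get_topk_pos
  return idx;
}

int main() {
  const std::vector<float> row = {0.1f, 0.9f, 0.4f};
  for (int p : topk_pos(row, 5)) std::cout << p << " ";  // prints "1 2 0 -1 -1"
  std::cout << std::endl;
  return 0;
}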
+ */ +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& in, + const lite::Tensor& row, + const lite::Tensor& col, + lite::Tensor* out, + lite::Tensor* pos, + int channel_num, + std::vector topks) { + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in.lod()[0]; + auto row_lod = row.lod()[0]; + auto col_lod = col.lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + lite::DDim dims(vec_pos_shape); + pos->Resize(dims); + auto pos_data = pos->mutable_data(lite::TargetType::kX86); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + lite::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in.data(); + auto out_data = out->mutable_data(lite::TargetType::kX86); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + + CHECK_EQ(total_size, channel_num * row_size * col_size) + << "size wrong in sequence_topk_avg_pooling_op!"; + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class SequenceTopkAvgPoolingFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.h b/lite/backends/x86/math/sequence_topk_avg_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..78d458c4d8fe0bf5a117cb5ad23d44bf0b7f3471 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
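The functor above builds a running sum over the top-k values of each row (a missing position, marked -1, repeats the previous sum) and divides by each requested k. A scalar sketch of that accumulation, independent of Lite's tensor and LoD types:

#include <iostream>
#include <vector>

// row: feature values; pos: top-k positions, -1 meaning "missing";
// topks: requested k values in ascending order; returns one average per k.
std::vector<float> topk_avg(const std::vector<float>& row,
                            const std::vector<int>& pos,
                            const std::vector<int>& topks) {
  const int max_k = static_cast<int>(pos.size());
  std::vector<float> sum(max_k, 0.f);
  sum[0] = (pos[0] == -1) ? 0.f : row[pos[0]];
  for (int k = 1; k < max_k; ++k) {
    sum[k] = sum[k - 1] + ((pos[k] == -1) ? 0.f : row[pos[k]]);
  }
  std::vector<float> out;
  for (int k : topks) out.push_back(sum[k - 1] / k);
  return out;
}

int main() {
  // Top-2 values of the row are 0.9 and 0.4, so the averages are 0.9 and 0.65.
  for (float v : topk_avg({0.1f, 0.9f, 0.4f}, {1, 2, 0}, {1, 2})) {
    std::cout << v << " ";
  }
  std::cout << std::endl;
  return 0;
}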
*/ + +#pragma once +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { +template +void get_topk_pos( + const T* data, int length, int k, int* pos, bool debug = false); + +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& X, + const lite::Tensor& ROW, + const lite::Tensor& COLUMN, + lite::Tensor* Out, + lite::Tensor* pos, + int channel_num, + std::vector topks); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index b02ef8fed6ebd62282352f1e3cb6819a0f66885e..641302cd2d3739b08c18ca010ee72b7ffde9198c 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -100,7 +100,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - PROFILE_DEPS basic_profiler + PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel cpp_op_desc tensor @@ -114,7 +114,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) lite_cc_library(program SRCS program.cc DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) + PROFILE_DEPS lite_profiler) if (NOT LITE_ON_TINY_PUBLISH) lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index c59c078787b9a6778227ba6ba51230d1fc2104cb..561a508d20f1db9283a410b8ee35dd851149429c 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -37,6 +37,9 @@ void TestCase::CreateInstruction() { // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); +#ifdef LITE_WITH_PROFILE + instruction_->set_profiler(new profile::Profiler()); +#endif } void TestCase::PrepareInputsForInstruction() { diff --git a/lite/core/context.h b/lite/core/context.h index 19238f1a9b609c794a3dfe9763a8becdcca8ad16..5063600d3621f28dee8cfa91f79ae3287853f7ab 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -253,6 +253,13 @@ class Context { std::string name() const { return "CUDAContext"; } + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + private: int device_id_; // overall information @@ -345,7 +352,6 @@ class ContextScheduler { std::unique_ptr NewContext(TargetType target) { std::unique_ptr ctx(new KernelContext); - switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( @@ -416,6 +422,7 @@ class ContextScheduler { void InitContext() { kernel_contexts_[Type].As().InitOnce(); } + ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 166c04c000d345eb39822d1d67321a1c6a05e9a5..f5b757ac3ccd6310f6a6fd9fe6483d28ff7adbc6 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -1039,7 +1039,7 @@ int DeviceInfo::Setup() { << ", max freq: " << max_freqs_[i] << ", min freq: " << min_freqs_[i] 
<< ", cluster ID: " << cluster_ids_[core_ids_[i]] - << ", CPU ARCH: A" << archs_[i]; + << ", CPU ARCH: A" << static_cast(archs_[i]); } LOG(INFO) << "L1 DataCache size is: "; for (int i = 0; i < core_num_; ++i) { @@ -1093,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { RequestPowerRandLowMode(shift_num, thread_num); break; default: - LOG(FATAL) << "Unsupported power mode: " << mode; + LOG(FATAL) << "Unsupported power mode: " << static_cast(mode); break; } if (active_ids_.empty()) { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 05d7a6b333810a8dc988d84a281f096babe8929f..86193235a2984b15a33c2eeaff15865d9f126eeb 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -31,7 +31,7 @@ #include "lite/utils/replace_stl/stream.h" #ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" +#include "lite/core/profile/profiler.h" #endif // LITE_WITH_PROFILE namespace paddle { @@ -58,7 +58,10 @@ class KernelBase { virtual void Run() = 0; #ifdef LITE_WITH_PROFILE - void SetProfileID(uint32_t id) { profile_id_ = id; } + void SetProfiler(profile::Profiler* profiler, int id) { + profiler_ = profiler; + profile_id_ = id; + } #endif void Launch() { @@ -82,10 +85,12 @@ class KernelBase { #endif #ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "kernel"); - Run(); - } + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming(profile_id_, ctx_.get()); + Run(); + profiler_->StopTiming(profile_id_, ctx_.get()); #else Run(); #endif @@ -175,6 +180,7 @@ class KernelBase { bool is_first_epoch_{true}; #ifdef LITE_WITH_PROFILE + profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; #endif }; diff --git a/lite/core/memory.cc b/lite/core/memory.cc index ec94f69be1e5c107cc61af80cdea7d006436021b..eefada3f998d5ad533c832fcd2a2c0b6c90d23d0 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -110,7 +110,7 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync( dst, src, size, IoDirection::DtoD); break; -#endif +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index ff064fb2ee93fc540e932da36fb07bb78eef989a..0d11b47db6a7f767f8cd032877d8647b0872b8d4 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -47,4 +47,5 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index d9d9c1bbf55bd33c31aa9a22de934d4eae8657c6..5ab5f8c0a4797e51cce656de43883a68d4931e9b 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -45,4 +45,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kX86)}); + .ExcludeTargets({TARGET(kX86), TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc 
b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc index fd9aadc5d01c2cb3b6c7a3e888503072a0798725..b1b492ce030c7a46d8b23936c1661f3d743eb9cb 100644 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc @@ -46,4 +46,5 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, paddle::lite::mir::ConvElementwiseFusePass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index af66f5ab66bd09907cb9d28f00f17d983e54c252..e4391cd24287cafe457074733ba73208288c3375 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,4 +35,5 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index ed10f06f5651f4000485279d682689101d80aa5a..7fc449219251bbd7e639e8092099f43fe8eca626 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -33,4 +33,5 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f823f45dc66f8ef6cc67cbb9b0d9860c86ec9340..da611e4490f4ba7268d9011b3dbb391a63a88305 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -396,6 +396,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetAttr("input_scale", scale_value); op_desc->SetInput("X", {input_act_node->arg()->name}); IR_NODE_LINK_TO(input_act_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_node, @@ -440,6 +442,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetInput("Y", {input_act_right_node->arg()->name}); IR_NODE_LINK_TO(input_act_left_node, quantized_node) IR_NODE_LINK_TO(input_act_right_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_left_node, diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 1f2355e8a3205cce3410bd2cb6ac4a17d8fde602..4f41ba4a601ae763e6fa48c0a98de238252ea7c2 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -255,4 +255,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}); + .BindTargets({TARGET(kARM)}) + .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)}); diff --git a/lite/core/mir/pass.h 
b/lite/core/mir/pass.h index 4de0fdbf357160348a403d3c8527fe62891237f0..4e8c8be292bbd5e7f46664378634d4f1aeed2965 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -52,34 +52,44 @@ class Pass { // Bind targets. At runtime, there must be one device in the bound targets. void BindTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); std::set_union(bound_targets_.begin(), bound_targets_.end(), universe.begin(), universe.end(), - std::inserter(res, res.begin())); + std::inserter(bound_targets_, bound_targets_.begin())); } - bound_targets_ = res; } // Exclude targets. At runtime, there must be one device in the bound targets. + // Disable the pass if one of the valid devices is in the excluded targets. void ExcludeTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); - std::set_difference(bound_targets_.begin(), - bound_targets_.end(), - universe.begin(), - universe.end(), - std::inserter(res, res.begin())); + std::set updated_bound_targets; + std::set_difference( + bound_targets_.begin(), + bound_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(updated_bound_targets, updated_bound_targets.begin())); + bound_targets_ = updated_bound_targets; + std::set_union( + excluded_targets_.begin(), + excluded_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(excluded_targets_, excluded_targets_.begin())); } - bound_targets_ = res; } // Get all bound targets. - const std::set& Targets() const { return bound_targets_; } + const std::set& BoundTargets() const { return bound_targets_; } + // Get all excluded targets. + const std::set& ExcludedTargets() const { + return excluded_targets_; + } // Some passes are only available on qualified kernels and need to be // explicitly declared. @@ -116,6 +126,7 @@ class Pass { std::string name_; std::string doc_; std::set bound_targets_; + std::set excluded_targets_; std::unordered_map> bound_kernels_; }; diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 4f6be2c186d2d940a799201812cce397a9e94eb4..5bddfcbd3c17288546dc6e0a0b4ebf984d26c504 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -47,10 +47,34 @@ bool KernelRegistered(const std::string name, const Place& place) { return false; } -bool PassMatchesTarget(const mir::Pass& pass, TargetType target) { - const auto& targets = pass.Targets(); - if (targets.find(TARGET(kAny)) != targets.end()) return true; - return (targets.find(target) != targets.end()); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets) { + // Whether the pass is suitable for targets ? The condition is the + // intersection of targets and pass's bound targets is not empty, besides the + // intersection of targets and pass's excluded targets is empty. The formula + // is as follows: matched = !empty(targets ^ pass.bound_targets) && + // empty(targets ^ pass.excluded_targets), where ^ is intersection operation. 
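The comment above gives the new matching rule: a pass is applied only when the valid targets intersect its bound targets and do not intersect its excluded targets. A worked example of that predicate over plain std::set (target names as strings stand in for TargetType, and the kAny wildcard handling is omitted):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <string>

// matched = !empty(targets ^ bound) && empty(targets ^ excluded), ^ = intersection
bool Matches(const std::set<std::string>& targets,
             const std::set<std::string>& bound,
             const std::set<std::string>& excluded) {
  std::set<std::string> hit, blocked;
  std::set_intersection(targets.begin(), targets.end(),
                        bound.begin(), bound.end(),
                        std::inserter(hit, hit.begin()));
  std::set_intersection(targets.begin(), targets.end(),
                        excluded.begin(), excluded.end(),
                        std::inserter(blocked, blocked.begin()));
  return !hit.empty() && blocked.empty();
}

int main() {
  // e.g. a fuse pass bound to every concrete target but excluded on kXPU.
  const std::set<std::string> bound = {"kARM", "kX86", "kNPU", "kXPU"};
  const std::set<std::string> excluded = {"kXPU"};
  std::cout << Matches({"kARM"}, bound, excluded) << std::endl;          // 1: pass runs
  std::cout << Matches({"kARM", "kXPU"}, bound, excluded) << std::endl;  // 0: pass skipped
  return 0;
}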
+ const auto& bound_targets = pass.BoundTargets(); + bool matched = bound_targets.find(TARGET(kAny)) != bound_targets.end(); + std::set inter_bound_targets; + std::set_intersection( + bound_targets.begin(), + bound_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_bound_targets, inter_bound_targets.begin())); + matched |= !inter_bound_targets.empty(); + const auto& excluded_targets = pass.ExcludedTargets(); + matched &= excluded_targets.find(TARGET(kAny)) == excluded_targets.end(); + std::set inter_excluded_targets; + std::set_intersection( + excluded_targets.begin(), + excluded_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_excluded_targets, inter_excluded_targets.begin())); + matched &= inter_excluded_targets.empty(); + return matched; } bool PassMatchesKernels(const mir::Pass& pass) { diff --git a/lite/core/mir/pass_utils.h b/lite/core/mir/pass_utils.h index 942f64bf3190be1f399ac6f014be0881b1450d9b..57e8da5e461f40bd79ece8139c3290e17e762996 100644 --- a/lite/core/mir/pass_utils.h +++ b/lite/core/mir/pass_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "lite/core/mir/pass.h" @@ -24,7 +25,8 @@ namespace lite { bool KernelRegistered(const std::string name, const Place& place); // Check if the pass hits the hardware target. -bool PassMatchesTarget(const mir::Pass& pass, TargetType target); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets); // Check if the pass hits all necessary operators. bool PassMatchesKernels(const mir::Pass& pass); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 7187ddcef6626888eaaf372f7b027aa5d9bd2a3a..cd54e2654c22b98cbacc9a73bef7770a029c0b30 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -48,7 +48,8 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::KernelBase& kernel, + size_t KernelGrade(const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, const std::vector& places) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; @@ -66,10 +67,11 @@ class StaticKernelPickPass : public mir::StmtPass { // valid_places.size() as default. // where i is the place's index in valid_places array. 
// score: score is the weighted sum of target、percision and layout - for (int i = 0; i < place_size; ++i) { + for (size_t i = 0; i < place_size; ++i) { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; size_t score{}; + // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || @@ -82,8 +84,12 @@ class StaticKernelPickPass : public mir::StmtPass { (place.precision == kernel.precision() || kernel.precision() == PRECISION(kAny) || place.precision == PRECISION(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::PrecisionFirst); + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + score += kMax / static_cast( + core::KernelPickFactor::Factor::PrecisionFirst); + } } VLOG(4) << "[score s2]:" << score; if (kernel_pick_factors_.IsDataLayoutConsidered() && @@ -102,17 +108,17 @@ class StaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " - << DataLayoutToStr(winner_place.layout) << " " + VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); VLOG(4) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << winner_place.DebugString(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "kernel place:" << kernel.place().DebugString(); + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c83cd70d8225a0b33a50ebdad331283f377e0059..65c29aa68f1c8c5f5702ca97d27f9579edc7a951 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // persistable=true, Sothat the model parser can recognize it and save it to // param files if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("Build NPU graph failed."); + LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -175,40 +175,19 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] 
Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[NPU] Build NPU graph failed."; - throw std::runtime_error("[NPU] Build NPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[NPU] Converting Subgraph " << id; + GenNPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "[NPU] program insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index 823ca5f1f624a9e920a5f395a9d5098c5ea52929..5b1a98c6ed0e10f4fae8832b9ba3c5f98f3d9ed9 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -35,7 +35,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -54,9 +53,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { void GenNPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 95339d6175c98f22d542db24f02d6d714ccbe2a8..1afb54c692592ca42d8b120dcf1a91922e19149c 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) { TestModel(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, - lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}}, + {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, input_tensor_shape, FLAGS_optimized_model_dir + "/NPU"); // verify results diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc index 319e1e51feb917b803753807ddbb1f72c2cb7084..4340cb4ee3cccad32db9bc333b5856386812c62a 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -115,10 +115,10 @@ std::string GenerateXPUProgramPass::BuildXPUGraph( graph_ctx.params, &ordered_cvted_var_nodes, weight)) { - LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("[XPU] Build XPU graph failed."); + LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << 
"[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -162,40 +162,19 @@ void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[XPU] Build XPU graph failed."; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateXPUProgramPass::GenProgram() { - LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h index cf121ae9503201e8cf6be40fe9054ccaf6e4b172..777642cfb6c61671a8aeb119c70664297573d9a7 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -35,7 +35,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -58,9 +57,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { void GenXPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 9d63dcbb38b2354c567ca1e0d434ac1a4be424c1..b3b7a858f68367ac789f390c6bd3bd94873f77d5 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -#ifdef LITE_WITH_OPENCL + // layout kernel choose // must ignore [layout check] for layout of kernels's input and output - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#endif + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of layout_trans + if ((in_arg_ty->target() == TARGET(kOpenCL) || + 
out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout())) { + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout()) { is_found = true; + } + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); break; } } + CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 7a3277786553d8a256c48e9e5c99530b8d5681b5..b008faa687474a88988adb9da81c594306298b26 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "to:" << to << "\n"; -// kernel choose branch for opencl backend -// judge inst's target whether is kOpenCL -// Note: to == *decl_arg_type == in of inst, not output of last inst -#ifdef LITE_WITH_OPENCL + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst // ignore [layout check] for layout between [to] and [from] // Because all of origin opencl insts in model, are not default layout // NCHW, @@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( // [*decl_arg_type] -> [to]: input of inst, not output of last // [in_arg_ty]: in of io_copy // [out_arg_ty]: out of io_copy - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to)) { - VLOG(4) << "do nothing. 
opencl found"; -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { -#endif + // + // noto: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { VLOG(4) << "picked"; is_found = true; + } + + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel io_copy_inst->AsStmt( io_copy_type, std::move(selected_kernels), io_copy_op); break; } + VLOG(4) << "not picked"; } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from << ":" << in->AsArg().name << " -> " << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index fe6ecfd66df23bb704fafcbf94106f7ca973c4f1..3f5d161a56aafa7fd9d058fd404e65cb04572116 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { } } - // Set the tye of the weight - void SetWeightType(Node* w, const LiteType& type) { -// TODO(xg) to optimize this -#ifdef LITE_WITH_FPGA - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifdef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); -#endif -#endif + // Set the type of the weight + void SetWeightType(Node* w, + const LiteType& type, + const std::map& lite_with_targets) { + VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); + if (lite_with_targets.at("kFPGA")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else if (lite_with_targets.at("kOpenCL")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); + } } void InferenceArgumentPlace(SSAGraph* graph) { + auto& valid_places = graph->valid_places(); + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); -// The IoCopyOp is a tool operator, it won't support the type inference. 
-// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for -// tool operator -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; -#endif -#endif + // The IoCopyOp is a tool operator, it won't support the type inference. + // in fpga, we has io_copy+cali+layout tool ops, so we need type inference + // for + // tool operator + if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { + VLOG(3) << "inst.op_type() == 'io_copy', continue"; + if (inst.op_type() == "io_copy") continue; + } // deal with inputs VLOG(4) << "Infering op " << inst.op_info()->Repr(); // TODO(zhaolong): Add check if the node's name in op's arguments. @@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_in->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); + SetWeightType(x_in, *type, lite_with_targets); } else { x_in->AsArg().type = type; } @@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_out->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); + SetWeightType(x_out, *type, lite_with_targets); } else { x_out->AsArg().type = type; } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 1400b254090b31c731a6633d5a3171d2f0c54d03..887ac3c9507b4fb36594c156b7b1b207cd7bb750 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -118,6 +118,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kAny, kNCHW); INIT_FOR(kCUDA, kAny, kAny); INIT_FOR(kCUDA, kInt8, kNHWC); + INIT_FOR(kCUDA, kInt64, kNCHW); + INIT_FOR(kCUDA, kInt64, kNHWC); INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7ed632d864d0c7ee1e028787fa20717390f29b55..d78ae690f9b019dff7728bd3e95c0b1406bea463 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -145,6 +145,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 22c5f193308c92d43cef45b663de97a3ba5958c7..38c9d0e29d5766dec21de76b740c1032ad44da7e 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -13,7 +13,9 @@ // limitations under the License. 
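The variable-place-inference change above replaces compile-time LITE_WITH_FPGA / LITE_WITH_OPENCL branches with a runtime check over the graph's valid places. A minimal sketch of that feature-detection idiom (Place is a placeholder struct here, not Lite's):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Placeholder for lite::Place holding only the field the check needs.
struct Place {
  std::string target;
};

// Runtime equivalent of the removed "#ifdef LITE_WITH_OPENCL" branches:
// consult the configured valid places instead of the build flags.
bool HasTarget(const std::vector<Place>& valid_places, const std::string& target) {
  return std::any_of(valid_places.begin(), valid_places.end(),
                     [&](const Place& p) { return p.target == target; });
}

int main() {
  const std::vector<Place> valid_places = {{"kARM"}, {"kOpenCL"}};
  if (HasTarget(valid_places, "kOpenCL") || HasTarget(valid_places, "kFPGA")) {
    std::cout << "keep io_copy ops during type inference" << std::endl;
  } else {
    std::cout << "skip io_copy ops during type inference" << std::endl;
  }
  return 0;
}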
#pragma once +#include #include +#include #include #include #include "lite/core/mir/generate_program_pass.h" @@ -49,23 +51,20 @@ class Optimizer { valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!graph_) << "duplicate optimize found"; + graph_.reset(new mir::SSAGraph); graph_->Build(program, valid_places); graph_->SetValidPlaces(valid_places); SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + if (passes.empty()) { - RunPasses(std::vector{ - { - #if 0 - "lite_quant_dequant_fuse_pass", // + std::vector passes_local{ + {"lite_quant_dequant_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -74,11 +73,10 @@ class Optimizer { "lite_transpose_softmax_transpose_fuse_pass", // "lite_interpolate_fuse_pass", // "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) "lite_elementwise_add_activation_fuse_pass", // -#endif -#endif - "static_kernel_pick_pass", // pick original kernel from graph +#endif + "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) // using kernel info @@ -107,17 +105,12 @@ class Optimizer { "argument_type_display_pass", // "variable_place_inference_pass", // - "argument_type_display_pass", // + "argument_type_display_pass", "runtime_context_assign_pass", - "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ - !defined(LITE_WITH_XPU) - // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel - "memory_optimize_pass", -#endif - "argument_type_display_pass" - }}); + "argument_type_display_pass", + "memory_optimize_pass"}}; + RunPasses(passes_local); } else { RunPasses(passes); } @@ -128,39 +121,13 @@ class Optimizer { // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { -#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) - auto target_place = Place{ -#ifdef LITE_WITH_NPU - TARGET(kNPU), -#endif -#ifdef LITE_WITH_XPU - TARGET(kXPU), -#endif - PRECISION(kFloat)}; - if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != - valid_places_.end()) { -#ifdef LITE_WITH_NPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); -#endif -#ifdef LITE_WITH_XPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_xpu_program_pass"); -#endif - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) { - LOG(WARNING) << "Build " << TargetToStr(target_place.target) - << " program failed!"; - } - } -#endif + // Extra passes are applied for NPU and XPU, they depends on the shapes + // of input tensors. so GenRuntimeProgram() must be called after the shapes + // of input tensors are determined. 
+ std::vector subgraph_passes{"generate_npu_program_pass", + "generate_xpu_program_pass"}; + RunPasses(subgraph_passes); + auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); @@ -202,14 +169,16 @@ class Optimizer { for (auto& x : passes) { LOG(INFO) << "== Running pass: " << x; mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool matched = false; + if (!pass) { + LOG(INFO) << " - Skip " << x << " because the pass isn't found."; + continue; + } + std::set targets; for (const auto& place : valid_places_) { - if (PassMatchesTarget(*pass, place.target)) { - matched = true; - } + targets.insert(place.target); } - matched = matched && PassMatchesKernels(*pass); + bool matched = + PassMatchesTarget(*pass, targets) && PassMatchesKernels(*pass); if (!matched) { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt index 54a239024413834cb30c6e135c378d10480863e7..b7ddd810af46a25e2c331c2f0364a72f466dc636 100644 --- a/lite/core/profile/CMakeLists.txt +++ b/lite/core/profile/CMakeLists.txt @@ -5,4 +5,5 @@ endif() lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - +lite_cc_library(lite_profiler SRCS profiler.cc DEPS context) +lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler) diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..a51b769c8f46a5ca8cb9ed74740b93844882cb16 --- /dev/null +++ b/lite/core/profile/profiler.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/profile/profiler.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace profile { + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit; + unit.character = ch; + if (ch.target == TargetType::kCUDA) { +#ifdef LITE_WITH_CUDA + unit.timer.reset(new DeviceTimer()); +#else + LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " + "default x86 timer is used instead."; +#endif + } else { + unit.timer.reset(new DeviceTimer()); + } + units_.push_back(std::move(unit)); + return units_.size() - 1; +} + +void Profiler::StartTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + units_[index].timer->Start(ctx); +} + +float Profiler::StopTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return units_[index].timer->Stop(ctx); +} + +std::string Profiler::Summary(bool concise) { + STL::stringstream ss; + auto cout_title = [&ss](const std::string& title, const std::string& name) { + // clang-format off + ss << "===== " << title << ": " << name << " =====" << std::endl; + ss << std::setw(25) << std::left << "Operator Type" \ + << std::setw(40) << std::left << "Kernel Name" \ + << std::setw(10) << std::left << "Remark" \ + << std::setw(10) << std::left << "Avg (ms)" \ + << std::setw(10) << std::left << "Min (ms)" \ + << std::setw(10) << std::left << "Max (ms)" \ + << std::endl; + // clang-format on + }; + if (concise) { + auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { + return (c1.target < c2.target) || (c1.op_type < c2.op_type) || + (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); + }; + std::map summary(op_comp); + for (auto& unit : units_) { + auto ch = summary.find(unit.character); + if (ch != summary.end()) { + ch->second.avg += unit.timer->LapTimes().Avg(); + ch->second.min += unit.timer->LapTimes().Min(); + ch->second.max += unit.timer->LapTimes().Max(); + } else { + TimeInfo info({unit.timer->LapTimes().Avg(), + unit.timer->LapTimes().Min(), + unit.timer->LapTimes().Max()}); + summary.insert({unit.character, info}); + } + } + cout_title("Concise Profiler Summary", name_); + for (const auto& item : summary) { + // clang-format off + ss << std::setw(25) << std::left << item.first.op_type \ + << std::setw(40) << std::left << item.first.kernel_name \ + << std::setw(10) << std::left << item.first.remark \ + << std::setw(10) << std::left << item.second.avg \ + << std::setw(10) << std::left << item.second.min \ + << std::setw(10) << std::left << item.second.max \ + << std::endl; + // clang-format on + } + } else { + cout_title("Detailed Profiler Summary", name_); + for (auto& unit : units_) { + // clang-format off + ss << std::setw(25) << std::left << unit.character.op_type \ + << std::setw(40) << std::left << unit.character.kernel_name \ + << std::setw(10) << std::left << unit.character.remark \ + << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Min() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Max() \ + << std::endl; + // clang-format on + } + } + return ss.str(); +} + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0fce8167cdd5383c2cc4ae5d641433582f0ee6a7 --- /dev/null +++ 
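One caution on the concise-summary map above: std::map requires a strict weak ordering, and chaining independent '<' comparisons with '||' does not give one (two keys can each compare "less" than the other through different fields), so grouping may misbehave. A lexicographic comparator, e.g. via std::tie, is the usual form; the sketch below uses stand-in string fields rather than Lite's OpCharacter:

#include <iostream>
#include <map>
#include <string>
#include <tuple>

struct Key {
  std::string target, op_type, kernel_name, remark;
};

// Lexicographic: later fields are consulted only on ties, which yields
// a valid strict weak ordering for std::map.
bool KeyLess(const Key& a, const Key& b) {
  return std::tie(a.target, a.op_type, a.kernel_name, a.remark) <
         std::tie(b.target, b.op_type, b.kernel_name, b.remark);
}

int main() {
  std::map<Key, int, bool (*)(const Key&, const Key&)> summary(KeyLess);
  summary[{"kARM", "conv2d", "conv_compute", "3x3"}] += 1;
  summary[{"kARM", "conv2d", "conv_compute", "3x3"}] += 1;  // merges with the first entry
  summary[{"kARM", "pool2d", "pool_compute", "max"}] += 1;
  std::cout << summary.size() << std::endl;  // prints "2"
  return 0;
}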
b/lite/core/profile/profiler.h @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/core/profile/timer.h" + +namespace paddle { +namespace lite { +namespace profile { + +struct TimeInfo { + float avg; + float min; + float max; +}; + +struct OpCharacter { + TargetType target; + std::string op_type{std::string("N/A")}; + std::string kernel_name{std::string("N/A")}; + std::string remark{std::string("N/A")}; +}; + +struct StatisUnit { + std::unique_ptr timer; + OpCharacter character; +}; + +class Profiler final { + public: + Profiler() = default; + explicit Profiler(const std::string& name) : name_(name) {} + int NewTimer(const OpCharacter& ch); + void StartTiming(const int index, KernelContext* ctx); + float StopTiming(const int index, KernelContext* ctx); + std::string Summary(bool concise = true); + + private: + std::string name_{std::string("N/A")}; + std::vector units_; +}; + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f49698ef4a8f83e4192a16801566fdcbd7baf9a --- /dev/null +++ b/lite/core/profile/test_timer.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
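The tests that follow exercise the Profiler only through the CUDA timer; on the host path no stream is required, since the default chrono-based Timer is selected whenever OpCharacter::target is not kCUDA. A rough host-side usage sketch mirroring the CUDA test (illustrative, not part of the patch):

#include <chrono>  // NOLINT
#include <thread>  // NOLINT
#include "lite/core/context.h"
#include "lite/core/profile/profiler.h"
#include "lite/utils/cp_logging.h"

namespace paddle {
namespace lite {
namespace profile {

// Times a dummy 5 ms "kernel" on the host and prints the detailed summary.
void HostProfilerExample() {
  KernelContext ctx;
  Profiler profiler("host_example");
  OpCharacter ch;
  ch.target = TargetType::kHost;
  ch.op_type = "operator/host";
  ch.kernel_name = "kernel/host";
  const int idx = profiler.NewTimer(ch);
  profiler.StartTiming(idx, &ctx);
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
  profiler.StopTiming(idx, &ctx);
  LOG(INFO) << "\n" << profiler.Summary(/*concise=*/false);
}

}  // namespace profile
}  // namespace lite
}  // namespace paddle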
+ +#include +#include // NOLINT +#include // NOLINT +#include "lite/core/context.h" +#include "lite/core/profile/profiler.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace profile { + +TEST(timer, real_latency) { + Timer timer; + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(); + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +#ifdef LITE_WITH_CUDA +TEST(gpu_timer, real_latency) { + DeviceTimer timer; + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + timer.Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(&ctx); + + (&timer)->Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(&ctx); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +TEST(profiler, real_latency) { + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + Profiler profiler("name"); + profile::OpCharacter ch; + ch.target = TargetType::kCUDA; + ch.op_type = "operator/1"; + ch.kernel_name = "kernel/1"; + int idx = profiler.NewTimer(ch); + profiler.StartTiming(idx, &ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + profiler.StopTiming(idx, &ctx); + std::cout << profiler.Summary(); +} +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..1e86f0d7b9be4914bdf1a6874195276d3c1b61ee --- /dev/null +++ b/lite/core/profile/timer.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e86f0d7b9be4914bdf1a6874195276d3c1b61ee
--- /dev/null
+++ b/lite/core/profile/timer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <list>
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+#include "lite/core/context.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+template <class T>
+class TimeList {
+ public:
+  void Clear() { laps_t_.clear(); }
+  void Add(T t) { laps_t_.push_back(t); }
+  T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); }
+  T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); }
+  T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); }
+  size_t Size() const { return laps_t_.size(); }
+  T Avg() const {
+    if (!Size()) {
+      return 0;
+    }
+    return Sum() / Size();
+  }
+  const std::list<T>& Raw() const { return laps_t_; }
+
+ private:
+  std::list<T> laps_t_;
+};
+
+class Timer {
+ public:
+  Timer() = default;
+  virtual ~Timer() = default;
+
+  void Reset() { laps_t_.Clear(); }
+  void Start() { t_start_ = std::chrono::system_clock::now(); }
+  float Stop() {
+    t_stop_ = std::chrono::system_clock::now();
+    auto ts = std::chrono::duration_cast<std::chrono::microseconds>(t_stop_ -
+                                                                    t_start_);
+    float elapse_ms = 1000.f * static_cast<float>(ts.count()) *
+                      std::chrono::microseconds::period::num /
+                      std::chrono::microseconds::period::den;
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+  virtual void Start(KernelContext* ctx) { return Start(); }
+  virtual float Stop(KernelContext* ctx) { return Stop(); }
+  float AvgLapTimeMs() const { return laps_t_.Avg(); }
+  const TimeList<float>& LapTimes() const { return laps_t_; }
+
+ protected:
+  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
+  TimeList<float> laps_t_;
+};
+
+template <TargetType Target>
+class DeviceTimer final : public Timer {};
+
+#ifdef LITE_WITH_CUDA
+template <>
+class DeviceTimer<TargetType::kCUDA> final : public Timer {
+ public:
+  DeviceTimer() {
+    CUDA_CALL(cudaEventCreate(&e_start_));
+    CUDA_CALL(cudaEventCreate(&e_stop_));
+  }
+  ~DeviceTimer() {
+    CUDA_CALL(cudaEventDestroy(e_start_));
+    CUDA_CALL(cudaEventDestroy(e_stop_));
+  }
+  void Start(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_start_, stream));
+  }
+  float Stop(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_stop_, stream));
+    CUDA_CALL(cudaEventSynchronize(e_stop_));
+    float elapse_ms = 1.f;
+    CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_));
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+
+ private:
+  cudaEvent_t e_start_, e_stop_;
+};
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/program.cc b/lite/core/program.cc
index b60f279c0fc74904477a080579a799f601e359b0..45796a478b3f2309912e6382b3380bf0734bd6ae 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -122,6 +122,9 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
   }
+#ifdef LITE_WITH_PROFILE
+  LOG(INFO) << "\n" << profiler_.Summary();
+#endif  // LITE_WITH_PROFILE
 }
 
 void Program::Build(const cpp::ProgramDesc& prog) {
@@ -183,11 +186,6 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
 void Instruction::Run() {
   CHECK(op_) << "op null";
   CHECK(kernel_) << "kernel null";
-#ifdef LITE_WITH_PROFILE
-  if (profile_id_ >= 0) {
-    profile::ProfileBlock x(profile_id_, "instruction");
-  }
-#endif  // LITE_WITH_PROFILE
   if (first_epoch_) {
     first_epoch_ = false;
     CHECK(op_->CheckShape());
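With the old `ProfileBlock` removed from `Instruction::Run()`, per-kernel timing now flows through the `Profiler` handle that each kernel receives via `SetProfiler` (see the `program.h` change below). The sketch here shows one plausible shape for that kernel-side hook; the member names and the idea that `Launch` wraps `Run` with `StartTiming`/`StopTiming` are assumptions for illustration, not the literal contents of `lite/core/kernel.h` in this patch.

```cpp
// Hypothetical sketch of the kernel-side profiling hook.
class KernelBase {
 public:
#ifdef LITE_WITH_PROFILE
  void SetProfiler(profile::Profiler* profiler, int id) {
    profiler_ = profiler;
    profile_id_ = id;
  }
#endif
  void Launch(KernelContext* ctx) {
#ifdef LITE_WITH_PROFILE
    if (profiler_ && profile_id_ >= 0) profiler_->StartTiming(profile_id_, ctx);
#endif
    Run();  // the kernel's actual computation
#ifdef LITE_WITH_PROFILE
    if (profiler_ && profile_id_ >= 0) profiler_->StopTiming(profile_id_, ctx);
#endif
  }
  virtual void Run() = 0;

 private:
#ifdef LITE_WITH_PROFILE
  profile::Profiler* profiler_{nullptr};
  int profile_id_{-1};
#endif
};
```

On CUDA targets the stop call records and synchronizes an event on the kernel's exec stream, so the reported latency reflects device time rather than host enqueue time.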
diff --git a/lite/core/program.h b/lite/core/program.h
index 7a6700da61f7ba9f35491613d7733b4b637b8ff0..1c1e4975c3a13bcfa9a22999a705f3a78b0fc68e 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -22,9 +22,6 @@
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/cpp/program_desc.h"
-#ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
-#endif  // LITE_WITH_PROFILE
 
 namespace paddle {
 namespace lite {
@@ -87,22 +84,7 @@ struct Program {
 struct Instruction {
   Instruction(const std::shared_ptr<OpLite>& op,
               std::unique_ptr<KernelBase>&& kernel)
-      : op_(op), kernel_(std::move(kernel)) {
-#ifdef LITE_WITH_PROFILE
-    if (op_->Type() != "feed" && op_->Type() != "fetch") {
-      profile_id_ = profile::BasicProfiler<profile::BasicTimer>::Global()
-                        .NewRcd(kernel_->SerializedKernelType())
-                        .id();
-      kernel_->SetProfileID(profile_id_);
-      // Set profile custom info
-      auto& profiler =
-          *profile::BasicProfiler<profile::BasicTimer>::Global().mutable_record(
-              profile_id_);
-      profiler.SetCustomInfo("op_type", op_->Type());
-      profiler.SetCustomInfo("op_info", op_->SerializedOpInfo());
-    }
-#endif  // LITE_WITH_PROFILE
-  }
+      : op_(op), kernel_(std::move(kernel)) {}
 
   // Run the instruction.
   void Run();
@@ -113,6 +95,20 @@ struct Instruction {
   const KernelBase* kernel() const { return kernel_.get(); }
   KernelBase* mutable_kernel() { return kernel_.get(); }
 
+#ifdef LITE_WITH_PROFILE
+  void set_profiler(profile::Profiler* profiler) {
+    profiler_ = profiler;
+    if (op_->Type() != "feed" && op_->Type() != "fetch") {
+      profile::OpCharacter ch;
+      ch.target = kernel()->target();
+      ch.op_type = op_->Type();
+      ch.kernel_name = kernel()->name();
+      profile_id_ = profiler->NewTimer(ch);
+      kernel_->SetProfiler(profiler_, profile_id_);
+    }
+  }
+#endif
+
  private:
   std::shared_ptr<OpLite> op_;
   std::unique_ptr<KernelBase> kernel_;
@@ -120,7 +116,7 @@ struct Instruction {
   bool has_run_{false};
 
 #ifdef LITE_WITH_PROFILE
-  // for profiler
+  profile::Profiler* profiler_;
   int profile_id_{-1};
 #endif  // LITE_WITH_PROFILE
 };
@@ -135,6 +131,9 @@ class LITE_API RuntimeProgram {
     if (instructions_.empty()) {
       LOG(FATAL) << "no instructions";
     }
+#ifdef LITE_WITH_PROFILE
+    set_profiler();
+#endif
   }
 
   void Run();
@@ -159,6 +158,15 @@ class LITE_API RuntimeProgram {
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
   lite::Scope* exec_scope_{};
+
+#ifdef LITE_WITH_PROFILE
+  profile::Profiler profiler_;
+  void set_profiler() {
+    for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
+      i->set_profiler(&profiler_);
+    }
+  }
+#endif
 };
 
 }  // namespace lite
diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def
index 1b5da970e8fa9b2793f7a4982d5ed22ed21e79fd..cc2e593000a414a915ae8f4242b5ea34d6688438 100644
--- a/lite/demo/cxx/Makefile.def
+++ b/lite/demo/cxx/Makefile.def
@@ -1,26 +1,22 @@
 CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
               -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
-LDFLAGS = -latomic -pthread -ldl
+LDFLAGS = -latomic -pthread -ldl -llog -lz
 
 SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot
-
-THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a
-
+ 
 SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \
                   -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \
                   -I/opt/android-ndk-r17c/sources/android/support/include \
                   -I/opt/android-ndk-r17c/sysroot/usr/include \
 
-THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include
-
 ifeq ($(ARM_ABI), arm8)
     CC = 
/opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ - CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android else CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ @@ -31,5 +27,5 @@ else /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi endif diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index ec72c044e3fd08bd775b23c373945c5bb5743d1d..b7768d763eb4f6d2255119f805753f96d4bef9e6 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,6 +1,6 @@ # C++ Demo 1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 +2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` 4. 
执行以下命令准备模拟器环境 ```shell @@ -27,8 +27,10 @@ tar zxvf mobilenet_v1.tar.gz make adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 @@ -37,6 +39,24 @@ adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/da cd ../mobile_light make adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` + +7. 编译并运行目标检测的demo +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb -s emulator-5554 push mobile_detection /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +``` +运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..784ad73da4bf1d37ee23c17ac7c4dfc5c08f2627 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..2304b38efffdd96e7e13073020df4954b5e53034 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + 
+############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 index f795b41d46acc3be67ff6c1a0bba0de1c1d8c82d..8ab8a3b7436c836f681510e28461628ed1038709 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 index d0767145b00bd40a3fbeff2aef4f7a0fc6f542d6..c13320603bcce91ebe1fca9014e36b07540abca1 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 index d235d6e25fa9abe47ba50d8336cafcdd6580e30d..9150ae6e44e2314a482f7fcb3d139a20cf9f0304 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 index b91aadcef813de2a6f3371fe2cc4989bd87cf1ab..7a2dbdd0fcc9611fe79fb2660ad215ac4ba0d769 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/mobile_detection/mobile_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b8f02aeedef991496541400e7db67c3e3ff0e51 --- /dev/null +++ b/lite/demo/cxx/mobile_detection/mobile_detection.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = { + "background", "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", + "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {0.5f, 0.5f, 0.5f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh && static_cast(data[0]) > 0) { + Object obj; + int x = static_cast(data[2] * oriw); + int y = static_cast(data[3] * orih); + int w = static_cast(data[4] * oriw) - x; + int h = static_cast(data[5] * orih) - y; + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.batch_id = 0; + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = 
std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 2; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.35 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 10; + origin.y = y + text_size.height + 10; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + const int in_width = 300; + const int in_height = 300; + input_tensor->Resize({1, 3, in_height, in_width}); + auto* data = input_tensor->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6bb36e136deec6088c7b75215fc35d6231283673 Binary files /dev/null and b/lite/demo/cxx/mobile_detection/test.jpg differ diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 5ac041b2cc53e8f17ad86a2b71e6b02058b7e249..0c9da1a76422edae45dfeec5d38556a5e2322a85 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -13,12 +13,10 @@ // limitations under the License. #include -#include +#include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT +#include "paddle_api.h" // NOLINT +#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -32,11 +30,21 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } +// 0. Enable OpenCL, if needed +// Enable `DEMO_WITH_OPENCL` macro below, if user need use gpu(opencl) +// #define DEMO_WITH_OPENCL void RunModel() { // 1. 
Set CxxConfig CxxConfig config; config.set_model_dir(FLAGS_model_dir); +#ifdef DEMO_WITH_OPENCL + std::vector valid_places{ + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + Place{TARGET(kARM), PRECISION(kFloat)}}; +#else std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; +#endif if (FLAGS_prefer_int8_kernel) { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); @@ -68,14 +76,22 @@ void RunModel() { // 6. Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - printf("Output dim: %d\n", output_tensor->shape()[1]); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "" || FLAGS_optimized_model_dir == "") { + std::cerr << "[ERROR] usage: " << argv[0] + << " --model_dir=" + << " --optimized_model_dir= " + << " --prefer_int8_kernel=[true|false]\n"; + exit(1); + } RunModel(); return 0; } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index e1833814cad17b2af182443874c69f4c91e542fc..c40e3d5e9aa1dfc88ca0fae8d14c11b2a6dcbe1d 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -12,27 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include +#include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT +#include "paddle_api.h" // NOLINT using namespace paddle::lite_api; // NOLINT -DEFINE_string(model_dir, "", "Model dir path."); - int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } -void RunModel() { +void RunModel(std::string model_dir) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(FLAGS_model_dir); + config.set_model_dir(model_dir); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -52,14 +47,19 @@ void RunModel() { // 5. 
Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - printf("Output dim: %d\n", output_tensor->shape()[1]); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); return 0; } diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 56c70cf1e1d28dcc1cd6945130520002c8150a8d..40c95415546d99a66abf2d6f3595ae8695c4df86 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -18,7 +18,6 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -47,7 +46,6 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 8949602cab00c28d03424ad7cca2387765375b80..0c8866eaf88145d3bb0703b32ffb3eaf80332898 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,3 +1,5 @@ + +# 1. basic kernels for basic models # for conv op add_kernel(conv_depthwise ARM basic SRCS conv_depthwise.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_direct ARM basic SRCS conv_direct.cc DEPS ${lite_kernel_deps} math_arm) @@ -14,51 +16,58 @@ add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_ add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS 
${lite_kernel_deps} math_arm) add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(unsqueeze_compute_arm ARM extra SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 2.other basic kernels: basic kernels that not used in basic models +add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 3. 
extra kernels +add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(density_prior_box_compute_arm ARM extra SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(stack_compute_arm ARM extra SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(affine_channel_compute_arm ARM extra SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(range_compute_arm ARM extra SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) + # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -74,7 +83,7 @@ add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -90,18 +99,17 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) 
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) -lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) -lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) -lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) -lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) -lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) - +lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) if(LITE_BUILD_EXTRA) + lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) + lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) + lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) + lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ebb96e21d5e856325b7abdb8342df2aea3d5b5c3..69e507ba347583b3761fe38d86136a22f2576c15 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -32,13 +32,18 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int pad = param.paddings[0]; + int pad = paddings[0]; int stride = param.strides[0]; + int threads = ctx.threads(); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int chin = param.x->dims()[1]; int hin = param.x->dims()[2]; int win = param.x->dims()[3]; @@ -46,22 +51,28 @@ void ConvCompute::PrepareForRun() { int hout = param.output->dims()[2]; int wout = param.output->dims()[3]; - bool kps_equal = (param.paddings[0] == param.paddings[1]) && - (param.strides[0] == param.strides[1]) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); + + bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2)); - bool flag_dw_5x5 = - (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw_5x5 = pads_all_equal && ((kw == 5 && stride == 1) || + (kw == 5 && stride == 2 && pad == 2)); bool flag_dw = flag_dw_3x3 || 
flag_dw_5x5; /// select conv impl - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { /// dw conv impl impl_ = new DepthwiseConv; VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && no_dilation) { - if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) { + bool use_winograd = + (threads == 1 && oc >= 4 && ic >= 4 && hout >= 6 && wout >= 6 && + pads_equal) || + (oc >= 32 && ic >= 32 && hout >= 16 && wout >= 16 && pads_equal); + if (use_winograd) { /// winograd conv impl impl_ = new WinogradConv; VLOG(3) << "invoking winograd conv"; @@ -92,22 +103,29 @@ void ConvCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = param.groups * w_dims[1]; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -130,23 +148,30 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || 
sw == 2) && @@ -194,7 +219,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -203,7 +228,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -213,7 +238,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -223,7 +248,7 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", diff --git a/lite/kernels/arm/conv_depthwise.cc b/lite/kernels/arm/conv_depthwise.cc index 6a20d607e3a594c8eff83e1f872433f1c6025fd2..e2eaef51ddcb169313e6675d497ca4d7cab438d3 100644 --- a/lite/kernels/arm/conv_depthwise.cc +++ b/lite/kernels/arm/conv_depthwise.cc @@ -31,19 +31,28 @@ void DepthwiseConv::PrepareForRun() { // select dw conv kernel if (kw == 3) { VLOG(5) << "invoke 3x3 dw conv fp32"; - // trans weights - constexpr int cblock = 4; - auto oc = w_dims[0]; - auto kh = w_dims[2]; - auto cround = ROUNDUP(oc, cblock); - weights_.Resize({cround, 1, kh, kw}); - // auto w_data = weights_.mutable_data(); - // auto w_data_in = param.filter->data(); - // lite::arm::math::conv_trans_weights_numc( - // w_data_in, w_data, oc, 1, cblock, kh * kw); - impl_ = lite::arm::math::conv_depthwise_3x3_fp32; - flag_trans_weights_ = false; - // flag_trans_weights_ = true; + auto paddings = *param.paddings; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + + if (pads_equal && paddings[0] == paddings[2] && + (paddings[0] == 0 || paddings[0] == 1)) { + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = false; + } else { + // trans weights + constexpr int cblock = 4; + auto oc = w_dims[0]; + auto kh = w_dims[2]; + auto cround = ROUNDUP(oc, cblock); + weights_.Resize({cround, 1, kh, kw}); + auto w_data = weights_.mutable_data(); + auto w_data_in = param.filter->data(); + lite::arm::math::conv_trans_weights_numc( + w_data_in, w_data, oc, 1, cblock, kh * kw); + impl_ = 
lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = true; + } } else if (kw == 5) { VLOG(5) << "invoke 5x5 dw conv fp32"; impl_ = lite::arm::math::conv_depthwise_5x5_fp32; diff --git a/lite/kernels/arm/conv_gemmlike.h b/lite/kernels/arm/conv_gemmlike.h index e00b8de6f4a66dfea91e8806821ba7cf3a9aa62b..5e59eb8d1790ab8845df3093ce7d86356b031034 100644 --- a/lite/kernels/arm/conv_gemmlike.h +++ b/lite/kernels/arm/conv_gemmlike.h @@ -52,12 +52,19 @@ class GemmLikeConv : public KernelLite { int oc = o_dims[1]; int kw = w_dims[3]; int kh = w_dims[2]; + + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; + + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int m = oc / param.groups; int k = ic * kh * kw / param.groups; @@ -66,7 +73,7 @@ class GemmLikeConv : public KernelLite { bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); bool ks_equal = (sw == sh) && (kw == kh); //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { + if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) { //! 1x1s1p0 gemmlike conv flag_1x1gemm_ = true; } else { diff --git a/lite/kernels/arm/conv_transpose_compute.cc b/lite/kernels/arm/conv_transpose_compute.cc index 5a18499c85d682e0983493869e7d54de81641a99..5c58b297138c0c042bc332e59f5ae7b76e83e779 100644 --- a/lite/kernels/arm/conv_transpose_compute.cc +++ b/lite/kernels/arm/conv_transpose_compute.cc @@ -76,19 +76,28 @@ void Conv2DTransposeCompute::Run() { bool fuse_relu = param.fuse_relu; bool flag_bias = (param.bias != nullptr); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int m = chout * kw * kh / group; int n = hin * win; int k = chin / group; + + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + int group_size_in = win * hin * chin / group; int group_size_out = wout * hout * chout / group; int group_size_coldata = m * n; + + bool pads_all_qual = pads_equal && (paddings[0] == paddings[2]); int hblock = lite::arm::math::get_hblock(&ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int group_size_weights = ((m_roundup * k + 15) / 16) * 16; bool flag_1x1s1p1 = (kw == 1) && (kh == 1) && (param.strides[0] == 1) && - (param.strides[1] == 1) && (param.paddings[0] == 0) && - (param.paddings[1] == 0) && (param.dilations[0] == 1) && - (param.dilations[1] == 1); + (param.strides[1] == 1) && pads_all_qual && + (paddings[0] == 0) && (dilations[0] == 1) && + (dilations[1] == 1); ctx.ExtendWorkspace(sizeof(float) * group * m * n); auto din = param.x->data(); @@ -129,12 +138,14 @@ void Conv2DTransposeCompute::Run() { wout, kh, kw, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dout_batch); } if (flag_bias) { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index d1b8d8a48ecd7d564947486ee2938d6b630c41e5..d02cabf277a5e25e2dc731b5bcf0eabe601c9aae 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -26,6 +26,7 @@ template <> void WinogradConv::ReInitWhenNeeded() { auto& param = this->Param(); auto& ctx = 
this->ctx_->template As(); + int threads = ctx.threads(); auto x_dims = param.x->dims(); auto w_dims = param.filter->dims(); @@ -36,77 +37,97 @@ void WinogradConv::ReInitWhenNeeded() { } int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; + int ih = x_dims[2]; + int iw = x_dims[3]; int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? ic : oc; - - const int n_wino = size_tile; - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + if (last_kernel_is_c4_ == 1) { + return; + } + last_kernel_is_c4_ = 1; + auto pad = *(param.paddings); + int pad_h = pad[0]; + int pad_w = pad[2]; + int oc_pad = (oc + 3) / 4 * 4; + int ic_pad = (ic + 3) / 4 * 4; + const int new_input_size = + (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + const int temp_size = + (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 256 + 512) * threads; + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + + weights_.Resize({1, 1, 1, 64 * oc_pad * ic_pad}); + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + auto weights_data_ = weights_.mutable_data(); + lite::arm::math::weight_trans_c4( + weights_data_, param.filter->data(), ic, oc, trans_tmp_ptr); + free(trans_tmp_ptr); + } else { + if (last_kernel_is_c4_ == 0) { + return; + } + last_kernel_is_c4_ = 0; + int tile_w = (ow + 5) / 6; + int tile_h = (oh + 5) / 6; + + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = ic > oc ? ic : oc; + + const int n_wino = size_tile; + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + + const int m_wino = oc; + int hblock = lite::arm::math::get_hblock(&ctx); + int m_round = hblock * ((m_wino + hblock - 1) / hblock); + weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + auto weights_wino = + static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + lite::arm::math::winograd_transform_weights( + weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); + auto weights_trans = weights_.mutable_data(); + for (int i = 0; i < 64; ++i) { + float* packed_weights = weights_trans + i * m_round * ic; + const float* weights_wino_ptr = weights_wino + i * oc * ic; + lite::arm::math::prepackA(packed_weights, + weights_wino_ptr, + 1.f, + ic, + 0, + m_wino, + 0, + ic, + false, + &ctx); + } + free(trans_tmp_ptr); + free(weights_wino); + } last_shape_ = x_dims; } template <> void WinogradConv::PrepareForRun() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - last_shape_ = x_dims; - - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = lite::arm::math::get_hblock(&ctx); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - lite::arm::math::winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - lite::arm::math::prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - &ctx); - } - free(trans_tmp_ptr); - free(weights_wino); + ReInitWhenNeeded(); } template <> void WinogradConv::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); - // extend workspace - ctx.ExtendWorkspace(workspace_size_); - const auto* i_data = param.x->data(); const auto* w_data = weights_.data(); const auto* b_data = param.bias ? param.bias->data() : nullptr; @@ -124,8 +145,42 @@ void WinogradConv::Run() { int ow = o_dims[3]; int oc = o_dims[1]; - lite::arm::math::conv_winograd3x3( - i_data, o_data, bs, oc, oh, ow, ic, ih, iw, w_data, b_data, param, &ctx); + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int threads = ctx.threads(); + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + lite::arm::math::conv_compute_6x6_3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } else { + lite::arm::math::conv_winograd3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } } } // namespace arm diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 33f0edc017adca477b2e71964efdcaddb0ca3a08..40ea54b2918ad6c1b18d36a6df287c7e3eb312a6 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -40,6 +40,7 @@ class WinogradConv : public KernelLite { Tensor weights_; DDim last_shape_; int workspace_size_{0}; + int last_kernel_is_c4_{-1}; }; } // namespace arm diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 1983c733180143dc0c715d6c8e3c4fddac6f8418..525eca269bae22d27d078f6696efcfb8566270c5 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -127,7 +127,8 @@ void FcCompute::Run() { k_, param.bias != nullptr, b_data, - false); + false, + &ctx); } } } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc index 0b1911abf4fe553b670cf21dbb519c24dc08f184..05d43dddec47a303a89a2d48b3fb91ff45e6e2c0 100644 --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -25,6 +25,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. 
shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -107,6 +139,11 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::arm::FillConstantCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); REGISTER_LITE_KERNEL( diff --git a/lite/kernels/arm/interpolate_compute.cc b/lite/kernels/arm/interpolate_compute.cc index a26777826db6976c755fac7798880871f407c12d..0398dabeaee4c042b33ac5572b783b126bc8ddb4 100644 --- a/lite/kernels/arm/interpolate_compute.cc +++ b/lite/kernels/arm/interpolate_compute.cc @@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { std::string interp_method = "Bilinear"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { std::string interp_method = "Nearest"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); @@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc52c5ea3ee452033cfd3c7d559cb88b21ca48f6 --- /dev/null +++ b/lite/kernels/arm/layout_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
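The GetShape helper added here resolves fill_constant's output shape from three possible sources in a fixed order: a ShapeTensor input, then a list of scalar shape tensors, then the static shape attribute; PrepareForRun uses the result to resize Out before Run. A minimal standalone sketch of that precedence, using plain std containers in place of the Lite tensor types (all names below are illustrative, not the framework API):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for the three shape sources of fill_constant.
struct FillConstantShapeSources {
  const std::vector<int32_t>* shape_tensor = nullptr;  // ShapeTensor input
  std::vector<const int32_t*> shape_tensor_list;       // ShapeTensorList input (scalar tensors)
  std::vector<int64_t> shape_attr;                     // static shape attribute
};

// Mirrors the precedence in GetShape(): shape tensor > tensor list > attribute.
std::vector<int64_t> ResolveShape(const FillConstantShapeSources& src) {
  if (src.shape_tensor != nullptr) {
    return std::vector<int64_t>(src.shape_tensor->begin(), src.shape_tensor->end());
  }
  if (!src.shape_tensor_list.empty()) {
    std::vector<int64_t> shape;
    for (const int32_t* dim : src.shape_tensor_list) shape.push_back(*dim);
    return shape;
  }
  return src.shape_attr;
}

int main() {
  FillConstantShapeSources src;
  src.shape_attr = {2, 3};
  std::vector<int32_t> runtime_shape = {4, 5, 6};
  src.shape_tensor = &runtime_shape;  // the runtime tensor wins over the attribute
  for (int64_t d : ResolveShape(src)) std::cout << d << " ";  // prints: 4 5 6
  std::cout << "\n";
  return 0;
}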
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/layout_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define NCHWTONHWC(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NCHW to NHWC should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int c = input_dim[1]; \ + int h = input_dim[2]; \ + int w = input_dim[3]; \ + param.y->Resize({n, h, w, c}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NCHW2NHWC(n, c, h * w, input, output); + +#define NHWCTONCHW(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NHWC to NCHW should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int h = input_dim[1]; \ + int w = input_dim[2]; \ + int c = input_dim[3]; \ + param.y->Resize({n, c, h, w}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NHWC2NCHW(n, c, h * w, input, output); + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(float); +} + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(int8_t); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(float); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(int8_t); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_fp32; +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_int8; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_fp32; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_int8; + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, 
kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/layout_compute.h b/lite/kernels/arm/layout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..13b8621029437ea18d960e9c22d53b7062983b8f --- /dev/null +++ b/lite/kernels/arm/layout_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
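The layout kernels registered above all reduce to a single index permutation; lite::arm::math::NCHW2NHWC and NHWC2NCHW are optimized implementations of it, and the c == 1 shortcut in the macros is valid because the two layouts coincide for a single channel. A plain reference version of the NCHW-to-NHWC mapping (standalone sketch, not the library routine):

#include <cassert>
#include <vector>

// Reference NCHW -> NHWC transform: dst[n][h][w][c] = src[n][c][h][w].
template <typename T>
std::vector<T> nchw_to_nhwc_ref(int n, int c, int h, int w, const std::vector<T>& src) {
  std::vector<T> dst(src.size());
  for (int in = 0; in < n; ++in)
    for (int ic = 0; ic < c; ++ic)
      for (int ih = 0; ih < h; ++ih)
        for (int iw = 0; iw < w; ++iw)
          dst[((in * h + ih) * w + iw) * c + ic] = src[((in * c + ic) * h + ih) * w + iw];
  return dst;
}

int main() {
  // 1x2x2x2 NCHW tensor: channel 0 = {0,1,2,3}, channel 1 = {4,5,6,7}.
  std::vector<int> src = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> dst = nchw_to_nhwc_ref(1, 2, 2, 2, src);
  assert((dst == std::vector<int>{0, 4, 1, 5, 2, 6, 3, 7}));
  return 0;
}

Note that the transform only needs n, c, and the spatial size, which is why the macros pass h * w to the library routine as a single argument.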
+ +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { +template +class NCHWToNHWCCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NCHWToNHWCCompute() = default; +}; + +template +class NHWCToNCHWCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NHWCToNCHWCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc index fa7e2c0c3ae4580f5d19e82f7c48c74db3058847..ba58b378f4dda22fd78ce76b80bdbca8d8f284a3 100644 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,7 +28,6 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -37,7 +36,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +REGISTER_LITE_KERNEL(lookup_table_v2, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/lookup_table_compute_test.cc b/lite/kernels/arm/lookup_table_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..78748edf39c43c5451f8fa3c4d63bde7405c7078 --- /dev/null +++ b/lite/kernels/arm/lookup_table_compute_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void lookup_table_compute_ref(const operators::LookupTableParam ¶m) { + auto *ids_t = param.Ids; + auto *output_t = param.Out; + int64_t padding_idx = param.padding_idx; + auto *ids = ids_t->data(); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.W; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(float)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(float)); + } else { + CHECK_LT(ids[i], row_number); + CHECK_GE(ids[i], 0); + memcpy(output + i * row_width, + table + ids[i] * row_width, + row_width * sizeof(float)); + } + } +} + +TEST(lookup_table_arm, retrieve_op) { + auto lookup_table = + KernelRegistry::Global().Create( + "lookup_table"); + ASSERT_FALSE(lookup_table.empty()); + ASSERT_TRUE(lookup_table.front()); +} + +TEST(lookup_table_arm, init) { + LookupTableCompute lookup_table; + ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat)); + ASSERT_EQ(lookup_table.target(), TARGET(kARM)); +} + +TEST(lookup_table_arm, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + auto w_dim = DDim(std::vector({4, 5})); + auto ids_dim = DDim(std::vector({3, 2})); + auto out_dim = DDim(std::vector({3, 2, 5})); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto *w_data = w.mutable_data(); + auto *ids_data = ids.mutable_data(); + auto *out_data = out.mutable_data(); + auto *out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % 4; + } + int out_num = out_dim.production(); + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + lookup_table.SetParam(param); + lookup_table.Run(); + param.Out = &out_ref; + lookup_table_compute_ref(param); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/lrn_compute.cc b/lite/kernels/arm/lrn_compute.cc index 18e6654282c8810a8310e540c2851fecb116f2d8..0476b1e6bde99e7993d1b0feb53ab10ba1b8f9b5 100644 --- a/lite/kernels/arm/lrn_compute.cc +++ b/lite/kernels/arm/lrn_compute.cc @@ -31,16 +31,16 @@ void LrnCompute::Run() { int channel = x_dims[1]; int h = x_dims[2]; int w = x_dims[3]; - const int local_size = param.local_size; + const int n = param.n; const float alpha = param.alpha; const float beta = param.beta; const float k = param.k; if (param.norm_region == "AcrossChannels") { lite::arm::math::compute_across_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } else { lite::arm::math::compute_within_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, 
channel, h, w, n, alpha, beta, k); } } @@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL( lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index 8e030006151c5834a68037800192ec7d9bc5d94d..e7030d00427e55c7faf333997cd90cba46260cd4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { const dtype* x_data = param.X->data(); dtype* out_data = param.Out->mutable_data(); auto x_dims = param.X->dims(); - int local_size = param.local_size; + int local_size = param.n; float alpha = param.alpha; float beta = param.beta; float k = param.k; @@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { } param.X = &x; param.Out = &output; - param.local_size = local_size; + param.n = local_size; param.alpha = alpha; param.beta = beta; param.k = k; diff --git a/lite/kernels/arm/matmul_compute.cc b/lite/kernels/arm/matmul_compute.cc index 29be34d0c273abe9fafb7d187cc6d443eefc2d55..d00a5bdc060431509b73b18336f41b3c688cbcf2 100644 --- a/lite/kernels/arm/matmul_compute.cc +++ b/lite/kernels/arm/matmul_compute.cc @@ -232,7 +232,7 @@ void MatMulCompute::Run() { int ldc = n_; if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); if (fabsf(alpha - 1.f) > 1e-8f) { for (size_t i = 0; i < param.Out->dims().production(); ++i) { o_data[i] *= alpha; diff --git a/lite/kernels/arm/mul_compute.cc b/lite/kernels/arm/mul_compute.cc index fa43b6cf8e5d7418583d44d2ed9b6e49d128d2d6..debe9e907cadafd67e6be40f7e49ff12cb4d527e 100644 --- a/lite/kernels/arm/mul_compute.cc +++ b/lite/kernels/arm/mul_compute.cc @@ -48,14 +48,13 @@ void MulCompute::Run() { CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; k_ = x_w; - + auto& ctx = this->ctx_->template As(); if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); } else { constexpr bool is_tranposed_y = false; - auto& ctx = this->ctx_->template As(); int hblock = lite::arm::math::get_hblock(&ctx); int m_round = hblock * ((m_ + hblock - 1) / hblock); ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 9f02a462a517077f662dcc952780b6e34bfb95a4..c9f0fed47854226327be86a02a9429a003fe4762 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -38,7 +38,7 @@ void PoolCompute::Run() { std::vector& ksize = param.ksize; std::vector& strides = param.strides; - std::vector& paddings = param.paddings; + std::vector& paddings = *param.paddings; std::string& pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -48,12 +48,15 @@ void PoolCompute::Run() { bool use_quantizer = param.use_quantizer; std::string& data_format = param.data_format; - bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && - (paddings[0] == paddings[1]); + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && + (paddings[0] == paddings[2]); if 
(global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } if (pooling_type == "max") { @@ -80,7 +83,8 @@ void PoolCompute::Run() { return; } } else { - if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) { + if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && + kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2_max(din, dout, @@ -106,7 +110,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p1_max(din, dout, @@ -132,7 +136,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p0_max(din, dout, @@ -158,7 +162,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p0_max(din, dout, @@ -184,7 +188,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p1_max(din, dout, diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index 79e5332172c9a488c83dd485f094250d71a1d5dc..7ed8a142dda06e2d1b8f9d8afdade0194d87d1e6 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/arm/pool_compute.h" #include #include +#include #include #include #include "lite/backends/arm/math/funcs.h" @@ -25,14 +26,21 @@ namespace lite { namespace kernels { namespace arm { -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -40,10 +48,12 @@ int PoolOutputSize( std::vector compute_output_shape(operators::PoolParam* param_) { const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; + auto paddings = *param_->paddings; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } @@ -56,7 +66,8 @@ std::vector compute_output_shape(operators::PoolParam* param_) { for (size_t i = 0; i < param_->ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } @@ -73,7 +84,7 @@ void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + 
std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +110,7 @@ void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -178,18 +189,22 @@ void pool_compute_ref(const operators::PoolParam& param) { int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -225,75 +240,92 @@ TEST(pool_arm, compute) { for (auto exclusive : {true, false}) { for (auto ksize : {2, 3}) { for (auto stride : {1, 2}) { - for (auto pad : {0, 1}) { - for (auto n : {1, 2}) { - for (auto c : {1, 3}) { + for (auto pad_left : {0, 1}) { + for (auto pad_right : {0, 1}) { + for (auto pad_top : {0, 1}) { + for (auto pad_bottom : {0, 1}) { + for (auto n : {1, 2}) { + for (auto c : {1, 3}) { #if 1 - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { + for (auto h : {2, 3, 4, 11}) { + for (auto w : {2, 3, 4, 11}) { #else - for (int h = 2; h < 25; h++) { - for (int w = 2; w < 25; w++) { + for (int h = 2; h < 25; h++) { + for (int w = 2; w < 25; w++) { #endif - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + VLOG(3) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride + << " pad_left:" << pad_left + << " pad_right:" << pad_right + << " pad_top:" << pad_top + << " pad_bottom:" << pad_bottom + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; - // init x, output - x.Resize(DDim(std::vector({n, c, h, w}))); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); ++i) { - float sign = i % 3 == 0 ? -0.03 : 0.05f; - x_data[i] = sign * (i % 128); - } + // init x, output + x.Resize( + DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + float sign = i % 3 == 0 ? 
-0.03 : 0.05f; + x_data[i] = sign * (i % 128); + } - // fill param - param.x = &x; - param.output = &output; - param.pooling_type = pooling_type; - if (global_pooling) { - param.ksize = {h, w}; - } else { - param.ksize = {ksize, ksize}; - } - param.global_pooling = global_pooling; - param.strides = {stride, stride}; - param.paddings = {pad, pad}; - param.exclusive = exclusive; - param.ceil_mode = ceil_mode; - param.adaptive = false; - param.use_quantizer = false; + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = { + pad_top, pad_bottom, pad_left, pad_right}; + param.exclusive = exclusive; + param.paddings = + std::make_shared>(paddings); + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; - const std::vector& output_shape = - compute_output_shape(¶m); - output.Resize(DDim(output_shape)); - output_ref.Resize(DDim(output_shape)); + const std::vector& output_shape = + compute_output_shape(¶m); + output.Resize(DDim(output_shape)); + output_ref.Resize(DDim(output_shape)); - auto* output_data = output.mutable_data(); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); ++i) { - output_data[i] = -2; - output_ref_data[i] = -2; - } + auto* output_data = output.mutable_data(); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); + ++i) { + output_data[i] = -2; + output_ref_data[i] = -2; + } - // compute - pool.SetParam(param); - pool.Run(); + // compute + pool.SetParam(param); + pool.Run(); - // compute ref - param.output = &output_ref; - pool_compute_ref(param); + // compute ref + param.output = &output_ref; + pool_compute_ref(param); - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + // compare + for (int i = 0; i < output.dims().production(); + i++) { + EXPECT_NEAR( + output_data[i], output_ref_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } } - VLOG(3) << "compare pass"; } } } diff --git a/lite/kernels/arm/split_compute.cc b/lite/kernels/arm/split_compute.cc index 27606e2d76dfd13161fffc3f53d614155f62254e..2a0c52e7fc44cdd7c36ac3e8f93b33731f03bd77 100644 --- a/lite/kernels/arm/split_compute.cc +++ b/lite/kernels/arm/split_compute.cc @@ -42,5 +42,9 @@ void SplitCompute::Run() { REGISTER_LITE_KERNEL( split, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SplitCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index b33fc8f6bb0a5616ab87c01d55f9d81a9fe7032b..4bf1cbf5210214befb3620f8b7d70923f41f98f2 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -5,24 +5,39 @@ endif() message(STATUS "compile with lite CUDA kernels") add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} context) +add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda 
CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(concat_compute_cuda CUDA basic SRCS concat_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(elementwise_add_compute_cuda CUDA basic SRCS elementwise_add_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) +add_kernel(elementwise_compute_cuda CUDA basic SRCS elementwise_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) add_kernel(calib_compute_cuda CUDA basic SRCS calib_compute.cu DEPS ${lite_kernel_deps}) add_kernel(layout_compute_cuda CUDA basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} cuda_transpose) add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS +${lite_kernel_deps} cudnn_pool) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_fc_compute_cuda CUDA basic SRCS search_fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_topk_avg_pooling_compute_cuda CUDA basic SRCS sequence_topk_avg_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) +add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) 
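Each add_kernel entry above is paired with a kernel class and a REGISTER_LITE_KERNEL block in the corresponding .h/.cu, following the same pattern as the CUDA kernels added in this patch. A minimal skeleton for a hypothetical my_op kernel is sketched below; the op name, class name, and bound argument names are placeholders, and the KernelLite template arguments are a best-guess reconstruction of the usual TARGET/PRECISION pair rather than text taken from this patch:

// my_op_compute.h (hypothetical example)
#pragma once
#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {

class MyOpCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
 public:
  // Defined in my_op_compute.cu; launches device work on the CUDA
  // context's exec_stream(), like the kernels in this patch.
  void Run() override;
  virtual ~MyOpCompute() = default;
};

}  // namespace cuda
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// In my_op_compute.cu, after defining Run():
// REGISTER_LITE_KERNEL(my_op, kCUDA, kFloat, kNCHW,
//                      paddle::lite::kernels::cuda::MyOpCompute, def)
//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
//     .Finalize();

The matching build entries would follow the same form as the lines above, e.g. add_kernel(my_op_compute_cuda CUDA basic SRCS my_op_compute.cu DEPS ${lite_kernel_deps}) plus an nv_test() entry for its unit test.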
lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) @@ -31,13 +46,28 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) +nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) -nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) +nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) +nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) +nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) +nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) +nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) +nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) +nv_test(search_fc_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) +nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) + if(LITE_BUILD_EXTRA) + nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda) + nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda) + nv_test(search_grnn_compute_cuda_test SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_cuda) + nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda) nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda) + nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) + nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) endif() diff --git a/lite/kernels/cuda/attention_padding_mask_compute.cu b/lite/kernels/cuda/attention_padding_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..fac73b1adc49fd90fbda33669aee53e4126a6649 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.cu @@ -0,0 +1,162 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/attention_padding_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_NUM_THREADS 256 + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_attention_padding_mask(T* out_data, + const T* attn_data, + const int* src_offset, + const int attn_seq_num, + const int attn_seq_len, + const int src_seq_num, + const int src_seq_len, + const T* pad_begin_data, + const T mask, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int src_word_id = tid % src_seq_len; + int tmp_tid = tid / src_seq_len; + int attn_seq_id = tmp_tid / attn_seq_len; + int attn_word_id = tmp_tid % attn_seq_len; + int src_seq_id = attn_seq_id % src_seq_num; + int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id]; + + int k = static_cast(pad_begin_data[src_seq_id]); + if (k < cur_len && + tid >= src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + k && + tid < src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + + cur_len) { + out_data[tid] = mask; + } else { + out_data[tid] = attn_data[tid]; + } + } +} + +void AttentionPaddingMaskCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto attn = param.X; + auto src = param.Y; + const int count = attn->numel(); + auto attn_offset = attn->lod()[0]; + auto src_offset = src->lod()[0]; + const int attn_seq_num = attn_offset.size() - 1; + const int attn_seq_len = attn_offset[1]; + const int src_seq_num = src_offset.size() - 1; + const int src_seq_len = count / attn->dims()[0]; + + auto out = param.Out; + out->Resize(attn->dims()); + out->set_lod(attn->lod()); + + auto attn_data = attn->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + std::vector src_cpu(src->numel(), 0); + TargetWrapperCuda::MemcpyAsync(src_cpu.data(), + src->data(), + sizeof(float) * src->numel(), + IoDirection::DtoH, + stream); + cudaStreamSynchronize(stream); + + std::vector pad_begin(src_seq_num, 0); + auto src_len = static_cast(src->lod()[0][1]); + int _pad_id = param.pad_id; + for (int i = 0; i < src_seq_num; ++i) { + const auto* src_data = src_cpu.data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = static_cast(index + 1); + } + + param.pad_begin->Resize({static_cast(src_seq_num)}); + auto pad_begin_cuda_data = + param.pad_begin->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(pad_begin_cuda_data, + pad_begin.data(), + sizeof(float) * src_seq_num, + IoDirection::HtoD, + stream); + + std::vector src_offset_cpu(src_offset.size(), 0); + for (int i = 0; i 
< src_offset.size(); i++) { + src_offset_cpu[i] = src_offset[i]; + } + + src_offset_cuda.Resize({static_cast(src_offset.size())}); + auto src_offset_cuda_data = src_offset_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data, + src_offset_cpu.data(), + sizeof(int) * src_offset.size(), + IoDirection::HtoD, + stream); + + ker_attention_padding_mask< + float><<>>( + out_data, + attn_data, + src_offset_cuda_data, + attn_seq_num, + attn_seq_len, + src_seq_num, + src_seq_len, + pad_begin_cuda_data, + param.mask, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_attention_padding_mask, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/attention_padding_mask_compute.h b/lite/kernels/cuda/attention_padding_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..57d8c269a1cdc1f6dcd59bf3399a835b64b6784c --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override; + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/attention_padding_mask_compute_test.cc b/lite/kernels/cuda/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d11858350d6fd49ef12cfe5d7ebb7ed865ee51d7 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
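ker_attention_padding_mask above uses a grid-stride loop: CUDA_KERNEL_LOOP makes each thread handle elements tid, tid + blockDim.x * gridDim.x, and so on, so a launch of CUDA_GET_BLOCKS(count) blocks of CUDA_NUM_THREADS threads covers any element count. A standalone illustration of the same launch pattern applied to a trivial fill kernel (not code from this patch):

#include <cuda_runtime.h>

#define CUDA_NUM_THREADS 256
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

inline int CUDA_GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Each thread strides across the buffer, so `count` need not be a
// multiple of the block size or the grid size.
__global__ void fill_value(float* out, float value, int count) {
  CUDA_KERNEL_LOOP(tid, count) { out[tid] = value; }
}

int main() {
  const int count = 1000;  // deliberately not a multiple of 256
  float* d_out = nullptr;
  cudaMalloc(&d_out, count * sizeof(float));
  fill_value<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS>>>(d_out, 0.f, count);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}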
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/attention_padding_mask_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? 
cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x_cpu, x_lod, get_max_len(y_lod)); + prepare_input(&y_cpu, y_lod, 1); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + auto context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/bilinear_interp_compute.cu b/lite/kernels/cuda/bilinear_interp_compute.cu index 7e1dbaf228c31d8123e48832e93e0180c4920359..00b14579383b67eccd65600869a156ba68d8cb09 100644 --- a/lite/kernels/cuda/bilinear_interp_compute.cu +++ b/lite/kernels/cuda/bilinear_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/bilinear_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + template __global__ void BilinearInterp(const T* in, const size_t in_img_h, @@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); - if (in_h == out_h && in_w == out_w) { cudaMemcpy(output_data, input_data, @@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/bilinear_interp_compute_test.cc b/lite/kernels/cuda/bilinear_interp_compute_test.cc index e7e8143150d2963fb4cb74c3530cfd6e125a454c..e93f5b1f3e8d6f5d93af2571a10d9fc531605f9c 100644 --- 
a/lite/kernels/cuda/bilinear_interp_compute_test.cc +++ b/lite/kernels/cuda/bilinear_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { } } +TEST(bilinear_interp, update) { + BilinearInterpCompute bilinear_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 1, in_h = 3, in_w = 3; + int out_h = 6, out_w = 6; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + + bilinear_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + bilinear_interp_kernel.SetContext(std::move(ctx)); + bilinear_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + for (int i = 0; i < out.numel(); i++) { + LOG(INFO) << out_cpu_data[i]; + } +} + } // namespace cuda } 
// namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/calib_compute_cuda_test.cc b/lite/kernels/cuda/calib_compute_cuda_test.cc index 8703d8730a1880b5b93502e5095b1a17d03bee6c..fdb47f7dd3c2e6d8f82e0281b81b24ebe444909a 100644 --- a/lite/kernels/cuda/calib_compute_cuda_test.cc +++ b/lite/kernels/cuda/calib_compute_cuda_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/kernels/cuda/calib_compute.h" #include #include #include @@ -58,12 +59,7 @@ void calib_ref(const operators::CalibParam& param, bool to_float = true) { } TEST(calib_cuda, int8_to_fp32) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(*std::next(kernels.begin(), 1)); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeInt8ToFp32 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -87,14 +83,14 @@ TEST(calib_cuda, int8_to_fp32) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; @@ -113,12 +109,7 @@ TEST(calib_cuda, int8_to_fp32) { } TEST(calib_cuda, fp32_to_int8) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(kernels.front()); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeFp32ToInt8 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -142,14 +133,14 @@ TEST(calib_cuda, fp32_to_int8) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; diff --git a/lite/kernels/cuda/concat_compute.cu b/lite/kernels/cuda/concat_compute.cu index 9ec693667252e76a99f305bc3ec9d6062cc2e840..72d0af459b26b6864dd308fece7131224b212a05 100644 --- a/lite/kernels/cuda/concat_compute.cu +++ b/lite/kernels/cuda/concat_compute.cu @@ -51,9 +51,9 @@ void ConcatCompute::Run() { Tensor* output = param.output; auto* output_data = output->mutable_data(TARGET(kCUDA)); int axis = param.axis; - auto* axis_tensor = param.axis_tensor; + Tensor* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + const int* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } int inner_size = 1; diff --git a/lite/kernels/cuda/conv_compute.cc b/lite/kernels/cuda/conv_compute.cc index eea81602ddf94158250aecf01fe5e95193bf58c1..468ed0cbd06a1b20596cef9ba8a7f0998de7fe73 100644 --- a/lite/kernels/cuda/conv_compute.cc +++ b/lite/kernels/cuda/conv_compute.cc @@ -21,10 +21,14 @@ namespace lite { namespace kernels { namespace cuda { -inline int ConvOutputSize( - int input_size, 
int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + int output_size = (input_size + pad_left + pad_right - dkernel) / stride + 1; CHECK_GT_OR_FALSE(output_size, 0); return output_size; @@ -50,11 +54,15 @@ void ConvComputeInt8::PrepareForRun() { const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); @@ -71,12 +79,15 @@ void ConvComputeInt8::Run() { const auto in_dims = param.x->dims(); const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 05175a0debcd687a2e5e06fa799839ad52c50adb..2ebd7e33baf8e12cfce24661f186382152b6bb89 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -41,7 +41,10 @@ TEST(conv_compute, fp32) { act_param.Leaky_relu_alpha = 0.1; operators::ConvParam param; param.activation_param = act_param; - param.paddings = {1, 1}; + std::vector pads = {1, 1, 1, 1}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.groups = 1; Tensor x, filter, bias, y, x_cpu, filter_cpu, bias_cpu, y_cpu; @@ -148,6 +151,10 @@ TEST(conv_compute, int8) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; @@ -202,12 +209,10 @@ TEST(conv_compute, int8_int8_out) { std::cout << "input" << std::endl; for (int i = 0; i < x_cpu.numel(); i++) { x_cpu_data[i] = static_cast(random(-36, 36)); - std::cout << float(x_cpu_data[i]) << std::endl; } std::cout << "filter" << std::endl; for (int i = 0; i < filter_cpu.numel(); i++) { filter_cpu_data[i] = static_cast(random(-10, 10)); - std::cout << float(filter_cpu_data[i]) << std::endl; } for (int i = 0; i < bias_cpu.numel(); i++) { bias_cpu_data[i] = i + 1.0; @@ -220,6 +225,10 @@ TEST(conv_compute, int8_int8_out) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; diff --git a/lite/kernels/cuda/elementwise_compute.cu b/lite/kernels/cuda/elementwise_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..64759f86f5df85f9855b9c1f186bbc9c039a044c --- 
/dev/null +++ b/lite/kernels/cuda/elementwise_compute.cu @@ -0,0 +1,318 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/backends/cuda/math/elementwise.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/elementwise_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +#define ELEMENTWISE_COMPUTE(OP, WITH_RELU) \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +#define ELEMENTWISE_COMPUTE_NHWC(OP, WITH_RELU) \ + std::map pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}}; \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = 
param.Out; \ + int axis = param.axis; \ + if (axis < 0) axis = x->dims().size() - y->dims().size(); \ + CHECK(axis >= 0) << "invalid axis of elementwise op"; \ + axis = pos_map[axis]; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +void ElementwiseAddCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + 
DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/elementwise_compute.h b/lite/kernels/cuda/elementwise_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..986a4db2272d9a6607090babd937747f861f49c7 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
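As an aside on the broadcast path added above: is_broadcast collapses X into a [pre, n, post] view so that Y (after trimming trailing 1s) is applied along the middle extent. A minimal CPU sketch of the same indexing, using a hypothetical helper name and plain std::vector buffers (not part of the patch), mirroring ElementwiseBroadcastRef in the test file further below:

#include <vector>
// Broadcast-add y (length n) over x viewed as [pre, n, post].
static void broadcast_add_ref(const std::vector<float>& x,
                              const std::vector<float>& y,
                              std::vector<float>* out,
                              int pre, int n, int post) {
  for (int i = 0; i < pre * n * post; ++i) {
    int idx = (i / post) % n;  // which element of y this x element pairs with
    (*out)[i] = x[i] + y[idx];
  }
}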
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class ElementwiseAddCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddCompute() = default; +}; + +class ElementwiseAddComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddComputeNHWC() = default; +}; + +class ElementwiseMulCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulCompute() = default; +}; + +class ElementwiseMulComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulComputeNHWC() = default; +}; + +class ElementwiseAddReluCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluCompute() = default; +}; + +class ElementwiseAddReluComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluComputeNHWC() = default; +}; + +class ElementwiseMulReluCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluCompute() = default; +}; + +class ElementwiseMulReluComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluComputeNHWC() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_compute_test.cc b/lite/kernels/cuda/elementwise_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9fd0b7754f2d3209137b5f4862dfe1e90279f3be --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute_test.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
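The NHWC kernels declared above remap the broadcast axis from NCHW order through the pos_map table in ELEMENTWISE_COMPUTE_NHWC before computing the (pre, n, post) extents. A small sketch of that remapping, with a hypothetical helper name (not part of the patch):

#include <map>
// NCHW axis index -> NHWC axis index, as in the pos_map used by the macro.
static int remap_axis_nchw_to_nhwc(int axis, int x_rank, int y_rank) {
  std::map<int, int> pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}};
  if (axis < 0) axis = x_rank - y_rank;  // same default as the kernel
  return pos_map[axis];
}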
+ +#include "lite/kernels/cuda/elementwise_compute.h" +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +static void ElementwiseAddRef(float* x, float* y, float* out, int num) { + for (int i = 0; i < num; ++i) { + out[i] = x[i] + y[i]; + } +} + +static void ElementwiseBroadcastRef( + float* x, float* y, float* out, int pre, int n, int post) { + for (int i = 0; i < pre * n * post; ++i) { + int idx = (i / post) % n; + out[i] = x[i] + y[idx]; + } +} + +TEST(elementwise_add, normal) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({n, c, h, w}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({n, c, h, w}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({n, c, h, w}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add, bias) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({c, 1, 1}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, 
x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef(x_ref_data, y_ref_data, out_ref_data, n, c, h * w); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add_nhwc, bias) { + ElementwiseAddComputeNHWC elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, h, w, c}); + y.Resize({c, 1, 1}); + out.Resize({n, h, w, c}); + x_cpu.Resize({n, h, w, c}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, h, w, c}); + x_ref.Resize({n, h, w, c}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, h, w, c}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef( + x_ref_data, y_ref_data, out_ref_data, n * h * w, c, 1); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/feed_compute.cc b/lite/kernels/cuda/feed_compute.cc index cffa8a573d9b12b52ae1448632a56e40cea35b95..e54c5b9b035ab63c1356343ec671f5e968fd479b 100644 --- a/lite/kernels/cuda/feed_compute.cc +++ b/lite/kernels/cuda/feed_compute.cc @@ -20,21 +20,22 @@ namespace lite { namespace kernels { namespace cuda { -void FeedCompute::Run() { - auto& param = this->Param(); +template +void FeedCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); VLOG(4) << "feed_list.size: " << param.feed_list->size(); const lite::Tensor& feed_item = (*param.feed_list)[param.col]; int num = static_cast(feed_item.numel()); - auto input = feed_item.data(); + auto input = feed_item.data(); param.out->Resize(feed_item.dims()); - auto output = param.out->mutable_data(TARGET(kCUDA)); + auto output = param.out->template mutable_data(TARGET(kCUDA)); VLOG(4) << "col: " << param.col << " num:" << num; 
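// Note (editor sketch, not part of the patch): the feed tensor lives in host
// memory, so the copy below is an asynchronous host-to-device transfer issued
// on the context's exec stream; with the kernel now templated on T, the byte
// count scales as num * sizeof(T) instead of being hard-coded to float.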
TargetW::MemcpyAsync( - output, input, num * sizeof(float), IoDirection::HtoD, stream); + output, input, num * sizeof(T), IoDirection::HtoD, stream); } } // namespace cuda @@ -42,8 +43,13 @@ void FeedCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::FeedCompute, nchw) +typedef paddle::lite::kernels::cuda::FeedCompute + FeedFp32; + +typedef paddle::lite::kernels::cuda::FeedCompute + FeedInt64; + +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNCHW, FeedFp32, nchw) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -54,8 +60,7 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNCHW))}) .Finalize(); -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNHWC, paddle::lite::kernels::cuda::FeedCompute, nhwc) +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNHWC, FeedFp32, nhwc) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -65,3 +70,25 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNCHW, FeedInt64, nchw) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNHWC, FeedInt64, nhwc) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/feed_compute.h b/lite/kernels/cuda/feed_compute.h index 0510404b2b6ad6c50f69c847bf833afbcfe59b99..9c42dcc1ca847ccbd58c0a578a969c4d77ec1bf1 100644 --- a/lite/kernels/cuda/feed_compute.h +++ b/lite/kernels/cuda/feed_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace cuda { -class FeedCompute : public KernelLite { +template +class FeedCompute : public KernelLite { public: using param_t = operators::FeedParam; using TargetW = TargetWrapper; diff --git a/lite/kernels/cuda/layout_compute.cc b/lite/kernels/cuda/layout_compute.cc index e2d0ae4f2ef10b29247a2f823988e8098aa33795..6b56d9e1de28cbec57b4b45aff1d1b237b1784b9 100644 --- a/lite/kernels/cuda/layout_compute.cc +++ b/lite/kernels/cuda/layout_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. 
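The layout_compute.cc changes below add a fast path: when the input shape collapses to a single dimension after dropping trailing 1s, the NCHW/NHWC transpose is an identity and the tensor is copied through unchanged. A minimal sketch of that check, with a hypothetical helper name operating on a plain dims vector (not part of the patch):

#include <cstdint>
#include <vector>
// True when a tensor with these dims has the same memory layout in NCHW and
// NHWC, i.e. the rank collapses to 1 after dropping trailing singleton dims.
static bool layout_transform_is_identity(std::vector<int64_t> dims) {
  while (!dims.empty() && dims.back() == 1) dims.pop_back();
  return dims.size() == 1;
}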
#include "lite/kernels/cuda/layout_compute.h" +#include #include "lite/backends/cuda/math/transpose.h" #include "lite/core/op_registry.h" @@ -21,11 +22,32 @@ namespace lite { namespace kernels { namespace cuda { +inline DDim trim_singular_dims(const DDim& dims) { + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + #define NCHWTONHWC(type) \ auto& param = this->template Param(); \ auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NCHW to NHWC should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ @@ -41,6 +63,11 @@ namespace cuda { auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NHWC to NCHW should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ diff --git a/lite/kernels/cuda/lookup_table_compute.cu b/lite/kernels/cuda/lookup_table_compute.cu index 34b6de0e105f8f6dbf070b4ad41a9e6c7d2a06c8..3c3bb952cac01a6d1e296085dc357b9b3a03773a 100644 --- a/lite/kernels/cuda/lookup_table_compute.cu +++ b/lite/kernels/cuda/lookup_table_compute.cu @@ -98,3 +98,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.cu b/lite/kernels/cuda/match_matrix_tensor_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..f89b9c9578e54ec8e7de93541eaa51a9b1d17a97 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.cu @@ -0,0 +1,145 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +void MatchMatrixTensorCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void MatchMatrixTensorCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kCUDA)); + + gemm_impl_->init( + false, false, x->dims()[0], dim_t * dim_in, dim_in, &context); + gemm_impl_->run( + 1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + gemm_impl_->init(false, + true, + len_l, + len_r, + dim_in, + dim_t * dim_in, + dim_in, + len_r, + &context); + gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::MatchMatrixTensorCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("W", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Tmp", + 
{LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.h b/lite/kernels/cuda/match_matrix_tensor_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..09db326ff3e992363e9b572ca91444499caed20f --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <memory> +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatchMatrixTensorCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::MatchMatrixTensorParam; + + void PrepareForRun() override; + void Run() override; + virtual ~MatchMatrixTensorCompute() = default; + + private: + std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/match_matrix_tensor_compute_test.cc b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ce0ae2a7a8b4e41a16da3b8ed7fce2eef30f4f76 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
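For the match_matrix_tensor kernel above, each batch b and each channel t in dim_t produces a len_l x len_r similarity block, so the output length is the sum of dim_t * len_l * len_r over batches; this is what top_offset accumulates, and why the test below sizes Out as {18, 1} for dim_t = 2 with x LoD {0, 2, 5} and y LoD {0, 3, 4}. A small sketch of that bookkeeping, with a hypothetical helper name (not part of the patch):

#include <vector>
// Cumulative output offsets per batch, mirroring top_offset in Run().
static std::vector<int> match_matrix_top_offsets(const std::vector<int>& lod_l,
                                                 const std::vector<int>& lod_r,
                                                 int dim_t) {
  std::vector<int> top_offset{0};
  for (size_t b = 0; b + 1 < lod_l.size(); ++b) {
    int len_l = lod_l[b + 1] - lod_l[b];
    int len_r = lod_r[b + 1] - lod_r[b];
    top_offset.push_back(top_offset.back() + dim_t * len_l * len_r);
  }
  return top_offset;
}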
+ +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(match_matrix_tensor, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + MatchMatrixTensorCompute kernel; + operators::MatchMatrixTensorParam param; + + // prepare ins and outs tensor in gpu, including size and lod + int ix = 5, iy = 4, h = 2, dim_t = 2; + Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + // init ins tensor in cpu + Tensor x_cpu, w_cpu, y_cpu, out_cpu, tmp_cpu; + x_cpu.Resize({ix, h}); + w_cpu.Resize({h, dim_t, h}); + y_cpu.Resize({iy, h}); + out_cpu.Resize({18, 1}); + tmp_cpu.Resize({20, 1}); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* w_cpu_data = w_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < w_cpu.numel(); ++i) { + w_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = static_cast(i); + } + + // cpu tensor data assigin to gpu tensor + x.Assign(x_cpu_data, x_cpu.dims()); + w.Assign(w_cpu_data, w_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute_test.cc b/lite/kernels/cuda/mul_compute_test.cc index d1c1d63e7dcd46f84cd128fc5b855da2098e179d..f521a12e2dddcf854b3982ae37f4da7631f6acf3 100644 --- a/lite/kernels/cuda/mul_compute_test.cc +++ b/lite/kernels/cuda/mul_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include "lite/backends/cuda/blas.h" namespace paddle { namespace lite { @@ -26,6 +27,7 @@ TEST(mul_compute, normal) { MulCompute mul_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); + context.InitOnce(); Tensor x, y, out, x_cpu, y_cpu, out_cpu; int x_h = 2, x_w_y_h = 3, y_w = 4; diff --git a/lite/kernels/cuda/nearest_interp_compute.cu b/lite/kernels/cuda/nearest_interp_compute.cu index 1a614e0656b417786deff8df6b7a827433b33f7b..adae034a1d68d723440c55ff3cc21430e1bc33b4 100644 --- a/lite/kernels/cuda/nearest_interp_compute.cu +++ b/lite/kernels/cuda/nearest_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/nearest_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + __global__ void KeNearestNeighborInterp(const float* in, const size_t in_img_h, const size_t in_img_w, @@ -79,19 +117,34 @@ void NearestInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto align_mode = param.align_mode; + + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); @@ -162,6 +215,14 @@ REGISTER_LITE_KERNEL(nearest_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/nearest_interp_compute_test.cc b/lite/kernels/cuda/nearest_interp_compute_test.cc index 85032016d630f11bbfe150f750470e89e241c61b..ad2ef9294e0de06a9dfdd141b8001bb34c6d1fb9 100644 --- 
a/lite/kernels/cuda/nearest_interp_compute_test.cc +++ b/lite/kernels/cuda/nearest_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -143,6 +144,116 @@ TEST(nearest_interp, normal) { } } +TEST(nearest_interp, update) { + NearestInterpCompute nearest_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 3, in_h = 40, in_w = 40; + int out_h = 80, out_w = 80; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + float* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + nearest_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + nearest_interp_kernel.SetContext(std::move(ctx)); + nearest_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + NearestInterpRef(&x_ref, &out_ref, false); + for (int i = 0; i 
< out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute.cu b/lite/kernels/cuda/pool_compute.cu index a2483a2c759e8acc5f5944fd316c83bb49530d36..d7e3739ddbb59a624e1911b8178e96053dacc0d1 100644 --- a/lite/kernels/cuda/pool_compute.cu +++ b/lite/kernels/cuda/pool_compute.cu @@ -256,6 +256,7 @@ void PoolCompute::Run() { bool adaptive = param.adaptive; auto x_dims = param.x->dims(); auto out_dims = param.output->dims(); + auto paddings = *param.paddings; const int in_h = x_dims[2]; const int in_w = x_dims[3]; const int out_h = out_dims[2]; @@ -266,8 +267,8 @@ void PoolCompute::Run() { const int win_w = param.ksize[1]; const int stride_h = param.strides[0]; const int stride_w = param.strides[1]; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int total_threads = out_dims.production(); const int threads = 512; const int blocks = (total_threads + threads - 1) / threads; @@ -357,6 +358,61 @@ void PoolCompute::Run() { if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); } +inline int PoolOutputSize( + int input_size, int filter_size, int padding, int stride, bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } + return output_size; +} + +void PoolComputeNHWC::PrepareForRun() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + pool_impl_.reset(new lite::cuda::math::CudnnPool2DNHWC); + pool_impl_->init(param, &ctx); +} + +void PoolComputeNHWC::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const auto x_dims = param.x->dims(); + std::vector& ksize = param.ksize; + if (param.global_pooling) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + (*param.paddings)[i] = 0; + ksize[i] = static_cast(x_dims[i + 1]); + } + } + + std::vector output_shape({x_dims[0]}); + if (param.adaptive) { + output_shape.insert( + output_shape.end(), param.ksize.begin(), param.ksize.end()); + } else { + for (size_t i = 0; i < param.ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize(x_dims[i + 1], + param.ksize[i], + (*param.paddings)[i], + param.strides[i], + param.ceil_mode)); + } + } + output_shape.push_back(x_dims[3]); + param.output->Resize(lite::DDim(output_shape)); + + pool_impl_->run(param); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); +} + } // namespace cuda } // namespace kernels } // namespace lite @@ -373,3 +429,19 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); + +REGISTER_LITE_KERNEL(pool2d, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::PoolComputeNHWC, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/pool_compute.h b/lite/kernels/cuda/pool_compute.h index 55b346bfaf4ac139c8d22bff2ac64f0e78bc6023..5c3a1bc2b93d3a03a40515fff6f14e604a11c0a1 100644 --- a/lite/kernels/cuda/pool_compute.h +++ b/lite/kernels/cuda/pool_compute.h @@ -13,6 
+13,9 @@ // limitations under the License. #pragma once +#include +#include +#include "lite/backends/cuda/math/cudnn_pool.h" #include "lite/core/kernel.h" namespace paddle { @@ -29,6 +32,20 @@ class PoolCompute virtual ~PoolCompute() = default; }; +class PoolComputeNHWC + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override; + void Run() override; + virtual ~PoolComputeNHWC() = default; + + private: + std::unique_ptr> + pool_impl_; +}; + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc index fe6ff92c0ce943cad36fbdd4f1408e344d9fd5fd..0e5aeec8c0133f1f61b469437e3e9a602096133f 100644 --- a/lite/kernels/cuda/pool_compute_test.cc +++ b/lite/kernels/cuda/pool_compute_test.cc @@ -27,42 +27,123 @@ namespace cuda { using Tensor = lite::Tensor; using DDim = lite::DDim; -static int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] + +template +void nhwc2nchw_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +static int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } -static std::vector compute_output_shape(operators::PoolParam* param_) { +static std::vector compute_output_shape(operators::PoolParam* param_, + bool is_nchw) { + int axis = 2; + if (!is_nchw) axis = 1; const auto x_dims = param_->x->dims(); std::vector& 
ksize = param_->ksize; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); + auto paddings = *param_->paddings; for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - std::vector output_shape({x_dims[0], x_dims[1]}); + std::vector output_shape({x_dims[0]}); + if (is_nchw) output_shape.push_back(x_dims[1]); if (param_->adaptive) { output_shape.insert( output_shape.end(), param_->ksize.begin(), param_->ksize.end()); } else { + auto paddings = *param_->paddings; for (size_t i = 0; i < param_->ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(x_dims[i + 2], + output_shape.push_back(PoolOutputSize(x_dims[i + axis], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } } + if (!is_nchw) output_shape.push_back(x_dims[3]); return output_shape; } @@ -75,7 +156,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +180,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -195,15 +276,15 @@ TEST(pool_cuda, compute) { for (auto pad : {0, 1}) { for (auto n : {1, 2}) { for (auto c : {1, 3}) { - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + for (auto h : {3}) { + for (auto w : {3}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; // init x, output x.Resize(DDim(std::vector({n, c, h, w}))); @@ -226,14 +307,16 @@ TEST(pool_cuda, compute) { } param.global_pooling = global_pooling; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); param.exclusive = exclusive; param.ceil_mode = ceil_mode; param.adaptive = false; param.use_quantizer = false; const std::vector& output_shape = - compute_output_shape(¶m); + compute_output_shape(¶m, true); if (output_shape[2] * output_shape[3] == 0) continue; output.Resize(DDim(output_shape)); output_ref.Resize(DDim(output_shape)); @@ -277,6 +360,131 @@ TEST(pool_cuda, compute) { } } } + +TEST(pool_cuda, nhwc) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + PoolComputeNHWC pool; + operators::PoolParam param; + pool.SetContext(std::move(ctx)); + + lite::Tensor x, temp; + lite::Tensor x_cpu; + lite::Tensor output; + lite::Tensor output_cpu, output_temp; + lite::Tensor output_ref; + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : 
{false}) { + for (auto global_pooling : {true, false}) { + for (auto exclusive : {false, true}) { + for (auto ksize : {3}) { + for (auto stride : {3}) { + for (auto pad : {1}) { + for (auto n : {1}) { + for (auto c : {3}) { + for (auto h : {8}) { + for (auto w : {8}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; + + // init x, output + x.Resize(DDim(std::vector({n, h, w, c}))); + temp.Resize(DDim(std::vector({n, h, w, c}))); + x_cpu.Resize(DDim(std::vector({n, c, h, w}))); + + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.dims().production(); ++i) { + float sign = i % 3 == 0 ? -0.03 : 0.05f; + x_cpu_data[i] = sign * (i % 128); + } + + nchw2nhwc_ref(&x_cpu, &temp); + auto* temp_cpu_data = temp.mutable_data(); + + x.Assign(temp_cpu_data, + temp.dims()); + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); + param.exclusive = exclusive; + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; + + const std::vector& output_shape = + compute_output_shape(¶m, false); + if (output_shape[2] * output_shape[3] == 0) continue; + output.Resize(DDim(output_shape)); + output_temp.Resize(DDim(output_shape)); + output_cpu.Resize(DDim(output_shape)); + + auto* output_data = + output.mutable_data(TARGET(kCUDA)); + auto* output_cpu_data = + output_cpu.mutable_data(); + + // compute + pool.SetParam(param); + pool.Launch(); + + // compute ref + param.x = &x_cpu; + // nchw + const std::vector& output_shape_ref = + compute_output_shape(¶m, true); + + output_ref.Resize(DDim(output_shape_ref)); + // auto* output_ref_data = + // output_ref.mutable_data(); + param.output = &output_ref; + pool_compute_ref(param); + nchw2nhwc_ref(&output_ref, &output_temp); + auto* output_temp_data = + output_temp.mutable_data(); + + cudaDeviceSynchronize(); + CopySync(output_cpu_data, + output_data, + sizeof(float) * output.numel(), + IoDirection::DtoH); + // compare + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR( + output_cpu_data[i], output_temp_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } + } + } + } + } + } + } + } + } + } +} } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..ddefb608dd233279b4a8127b100151acf8ffc8e6 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda {} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_aligned_mat_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.h b/lite/kernels/cuda/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c4552d9c43e2dcbc3bf0211f7028811410cb6c --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.h @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
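The header that follows implements search_aligned_mat_mul on the GPU. As a reading aid, here is a minimal host-side sketch of the math the kernel performs, assuming (as the op requires) that every sequence in X and Y has the same length, so the batch reduces to seq_num independent GEMMs. All names are illustrative; the real kernel drives the same loop through lite::cuda::math::BatchedGemm rather than computing on the host.

// Reference only: per-sequence matmul with M/N/K derived from the transpose
// flags, matching the reference implementation used by the unit test.
static void aligned_matmul_ref(const float* x, const float* y, float* out,
                               int seq_num, int x_batch, int x_inner,
                               int y_batch, int y_inner,
                               bool trans_x, bool trans_y, float alpha) {
  const int M = trans_x ? x_inner : x_batch;
  const int N = trans_y ? y_batch : y_inner;
  const int K = trans_x ? x_batch : x_inner;  // must equal trans_y ? y_inner : y_batch
  for (int s = 0; s < seq_num; ++s) {
    const float* a = x + s * x_batch * x_inner;   // one sequence of X
    const float* b = y + s * y_batch * y_inner;   // one sequence of Y
    float* c = out + s * M * N;                   // one sequence of Out
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        float sum = 0.f;
        for (int k = 0; k < K; ++k) {
          float av = trans_x ? a[k * x_inner + i] : a[i * x_inner + k];
          float bv = trans_y ? b[j * y_inner + k] : b[k * y_inner + j];
          sum += av * bv;
        }
        c[i * N + j] = alpha * sum;
      }
    }
  }
}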
+ +#pragma once +#include +#include "lite/backends/cuda/math/batched_gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + int seq_num = param.X->lod()[0].size() - 1; + batched_gemm_impl_.reset(new lite::cuda::math::BatchedGemm); + CHECK( + batched_gemm_impl_->init(x_transpose, y_transpose, seq_num, &cuda_ctx)); + A_ = static_cast(malloc(3 * seq_num * sizeof(float*))); + CHECK(A_); + } + + void Run() override { + auto& param = this->Param(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + auto x_stride = x_batch_size * x_inner_size; + auto y_stride = y_batch_size * y_inner_size; + auto out_stride = M * N; + for (int seq = 0; seq < seq_num; seq++) { + A_[seq] = const_cast(x_data) + seq * x_stride; + A_[seq + seq_num] = const_cast(y_data) + seq * y_stride; + A_[seq + seq_num * 2] = out_data + seq * out_stride; + } + batched_gemm_impl_->run( + alpha, 0.0f, const_cast(A_), M, N, K, seq_num); + } + + ~SearchAlignedMatMulCompute() { + if (A_ != nullptr) { + free(A_); + } + } + + private: + std::unique_ptr> + batched_gemm_impl_; + float** A_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f08333b3103973f99d37e39e7e7babeb52b335f1 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
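The kernel above packs the per-sequence input, weight, and output pointers into a single A_ buffer before handing them to lite::cuda::math::BatchedGemm. The wrapper's internals are not part of this patch, so the sketch below only shows one plausible backing call, cublasSgemmBatched, with the usual row-major-to-column-major operand swap; treat it as an assumption, not a description of the actual implementation.

// Sketch under the assumption that BatchedGemm wraps cublasSgemmBatched.
// Row-major C = A * B is expressed as column-major C^T = B^T * A^T.
#include <cublas_v2.h>

static cublasStatus_t batched_gemm_rowmajor(cublasHandle_t handle,
                                            const float** a_ptrs,  // seq_num matrices, M x K
                                            const float** b_ptrs,  // seq_num matrices, K x N
                                            float** c_ptrs,        // seq_num matrices, M x N
                                            int M, int N, int K,
                                            float alpha, float beta,
                                            int seq_num) {
  // The pointer arrays themselves must reside in device memory
  // (e.g. copied there with cudaMemcpyAsync before the call).
  return cublasSgemmBatched(handle,
                            CUBLAS_OP_N, CUBLAS_OP_N,
                            N, M, K,            // dimensions swapped for row-major data
                            &alpha,
                            b_ptrs, N,          // B first, ldb = N
                            a_ptrs, K,          // then A, lda = K
                            &beta,
                            c_ptrs, N,          // ldc = N
                            seq_num);
}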
+ +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_aligned_mat_mul_compute_ref(const operators::MatMulParam& param) { + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + T alpha = static_cast(param.alpha); + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int lda = x_transpose ? M : K; + int ldb = y_transpose ? K : N; + int ldc = N; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + for (int seq = 0; seq < seq_num; seq++) { + auto a = x_data + seq * x_stride; + auto b = y_data + seq * y_stride; + auto c = out_data + seq * out_stride; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T av; + T bv; + if (x_transpose) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (y_transpose) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + c[i * ldc + j] = alpha * sum; + } + } + } +} + +TEST(search_aligned_mat_mul_compute, normal) { + Env::Init(); + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + int out_batch_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + out_batch_size = x_inner_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + out_batch_size = x_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + out_batch_size = x_batch_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + out_batch_size = x_batch_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + std::vector out_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + out_lod_0[i + 1] = out_lod_0[i] + out_batch_size; + } + LoD x_lod; + LoD y_lod; + LoD out_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + out_lod.push_back(out_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + DDim out_dims({static_cast(out_lod_0.back()), + 
static_cast(out_inner_size)}); + // prepare input&output tensors + Tensor x_dev, x_host, y_dev, y_host, out_dev, out_host, out_ref; + x_host.Resize(x_dims); + y_host.Resize(y_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + y_dev.Resize(y_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + y_host.set_lod(y_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + y_dev.set_lod(y_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto y_host_data = y_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < y_host.dims().production(); i++) { + y_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + y_dev.Assign(y_host_data, + y_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::MatMulParam param; + param.X = &x_dev; + param.Y = &y_dev; + param.Out = &out_dev; + param.alpha = alpha; + param.transpose_X = x_transpose; + param.transpose_Y = y_transpose; + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchAlignedMatMulCompute search_aligned_mat_mul; + search_aligned_mat_mul.SetParam(param); + search_aligned_mat_mul.SetContext(std::move(ctx)); + search_aligned_mat_mul.Launch(); + cudaDeviceSynchronize(); + CopySync( + out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.X = &x_host; + param.Y = &y_host; + param.Out = &out_ref; + search_aligned_mat_mul_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute.cu b/lite/kernels/cuda/search_fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..591e2474a475590e8c7d3882b4dfa8f5a55a3ab0 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_fc_compute.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +template +static void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const T alpha, + const T* A, + const T* x, + const T beta, + T* y); +template <> +void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const float alpha, + const float* A, + const float* x, + const float beta, + float* y) { + cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} +template +static void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C); + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const float* B, + const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA /* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB /* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N)); +} + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const char alpha, + const char* A, + const char* B, + const char beta, + char* C) { + LOG(FATAL) << "int8 gemm is not implemented"; +} + +template +static __global__ void add_bias(int n, + int output_size, + const T* bias, + T* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +template +void SearchFcCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const Tensor* x_tensor = param.X; + param.Out->Resize({x_tensor->dims()[0], param.out_size}); + _M = x_tensor->dims().count(0, 1); + _K = x_tensor->dims().count(1, x_tensor->numel()); + _N = param.out_size; + const T* din = x_tensor->data(); + Tensor* out_tensor = param.Out; + T* dout = out_tensor->mutable_data(TARGET(kCUDA)); + const Tensor* w_tensor = param.W; + const T* weight = w_tensor->data(); + const Tensor* b_tensor = param.b; + const T* bias = b_tensor->data(); + cublasCreate(&_handle); + if (_M == 1 && _K > 50000) { + anakin_NV_gemv(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); + } else { + anakin_NV_gemm(_handle, + false, + !_flag_trans_weights, + _M, + _N, + _K, + (T)1, + din, + weight, + (T)0, + dout); + } + int total_size = _M * _N; + add_bias<<>>( + total_size, _N, bias, dout); +} +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", 
{LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_fc_compute.h b/lite/kernels/cuda/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..db09362734ecdb05663a5a6d4297ab869cb1b55d --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override; + virtual ~SearchFcCompute() = default; + + private: + bool _flag_trans_weights{false}; + int _M; + int _K; + int _N; + cublasHandle_t _handle; + bool _is_continue_buf{true}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute_test.cc b/lite/kernels/cuda/search_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f06028fbe15557c652c442ac436fa09700a56e28 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? 
bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc, normal) { + SearchFcCompute search_fc_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + operators::SearchFcParam param; + lite::Tensor X, X_gpu, W, W_gpu, b, b_gpu; + lite::Tensor Out, Out_cpu, out_ref; + std::vector x_shape{1, 4}; + X.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + W.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + Out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = X.mutable_data(); + auto w_data = W.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < X.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < W.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + X_gpu.Assign(x_data, X.dims()); + W_gpu.Assign(w_data, W.dims()); + b_gpu.Assign(b_data, b.dims()); + param.X = &X_gpu; + param.W = &W_gpu; + param.b = &b_gpu; + param.out_size = 4; + param.Out = &Out; + search_fc_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + search_fc_kernel.SetContext(std::move(ctx)); + search_fc_kernel.Run(); + fc_cpu_base(&X, &W, &b, 4, &out_ref); + cudaDeviceSynchronize(); + const float* out_data = Out.data(); + float* out_cpu_data = Out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute.cu b/lite/kernels/cuda/search_grnn_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..468b66e5680c7d0e5879def9a888e10faa0bca32 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.cu @@ -0,0 +1,351 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_grnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +__global__ void PreComputeKernel( + const int num, const T* w_x_e, const T* wz_x_e, T* tilde, T* z, T* hidden) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + tilde[index] = std::tanh(w_x_e[index]); + z[index] = 1 / (1 + std::exp(-wz_x_e[index])); + hidden[index] = (1. 
- z[index]) * tilde[index]; + } +} + +template +__global__ void PostComputeKernel(const int start, + const int end, + const int cap_h, + const int w_tm1, + const T* wr_x_e, + const T* ur_x_h, + const T* wz_x_e, + const T* uz_x_h, + const T* w_x_e, + const T* u_x_h, + T* r, + T* z, + T* tilde, + T* hidden) { + int j = start + blockIdx.x * blockDim.x + threadIdx.x; + if (j < end) { + r[j] = 1 / (1 + std::exp(-(wr_x_e[j] + ur_x_h[j]))); + z[j] = 1 / (1 + std::exp(-(wz_x_e[j] + uz_x_h[j]))); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } +} + +void SearchGrnnCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto cuda_stream = context.exec_stream(); + + auto* _input = input_blob; + int dim0 = _input->dims()[0]; + int dim1 = 1; + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + idx_sorted_by_width_cpu = std::make_shared(); + idx_sorted_by_width_cpu->Resize({batch}); + int* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->mutable_data(); + + Tensor _width; + _width.Resize({batch}); + int* width_data = _width.mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_cpu_data[i] = i; + } + std::sort(idx_sorted_by_width_cpu_data, + idx_sorted_by_width_cpu_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_cpu_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_cpu_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_cpu_data[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_cpu_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + auto* _layout_input = new Tensor(); + auto* _layout_input_gpu = param.layout_input; + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + _layout_input_gpu->set_lod(new_lod); + _layout_input_gpu->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->mutable_data(); + auto* input_cpu = new Tensor(); + input_cpu->Resize(_input->dims()); + auto* input_cpu_data = input_cpu->mutable_data(); + TargetW::MemcpyAsync(input_cpu_data, + _input->data(), + _input->numel() * sizeof(float), + IoDirection::DtoH, + cuda_stream); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + input_cpu_data + 
dim1 * offset[idx_sorted_by_width_cpu_data[j]] + + dim1 * i, + dim1 * sizeof(float)); + } + } + + auto* _layout_input_gpu_data = + _layout_input_gpu->mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(_layout_input_gpu_data, + new_emb, + _layout_input->numel() * sizeof(float), + IoDirection::HtoD, + cuda_stream); + delete _layout_input; + delete input_cpu; +} + +void SearchGrnnCompute::CopyBack(float* from, float* to, int step) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + TargetW::MemcpyAsync( + to + step * (offset[idx_sorted_by_width_cpu_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(float), + IoDirection::DtoD, + stream); + } + } +} + +void SearchGrnnCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kCUDA)); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->mutable_data(TARGET(kCUDA)); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2h, w_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2hr, wr_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, 
new_emb, e2hz, wz_x_e, &context); + + // precompute hidden0 + int num = batch * _cap_h; + int threads = 512; + int blocks = (num + threads - 1) / threads; + PreComputeKernel<<>>( + num, w_x_e, wz_x_e, tilde, z, hidden); + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2h, u_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hr, ur_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hz, uz_x_h + new_offset[i] * _cap_h, &context); + + // compute the gate and hidden + int start = new_offset[i] * _cap_h; + int end = (new_offset[i] + w) * _cap_h; + PostComputeKernel<<>>(start, + end, + _cap_h, + w_tm1, + wr_x_e, + ur_x_h, + wz_x_e, + uz_x_h, + w_x_e, + u_x_h, + r, + z, + tilde, + hidden); + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGrnnCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wi", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wh", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("tmp_buffer", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("layout_input", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_grnn_compute.h b/lite/kernels/cuda/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..73d84635d06f578f68bd844fe275d99595e70fc8 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
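PreComputeKernel and PostComputeKernel above implement a GRU-style cell over the width-sorted batch layout. The scalar sketch below restates the gate equations for a single hidden unit; the names are illustrative, and the projections (W·e, U·h) are assumed to be precomputed, as they are in the kernel via the GEMM calls.

// Reference only: one GRU-style update per hidden unit.
#include <cmath>

struct GrnnGates {
  float w_x, wr_x, wz_x;  // W*e, Wr*e, Wz*e for the current word
  float u_h, ur_h, uz_h;  // U*h_prev, Ur*h_prev, Uz*h_prev
};

static inline float sigmoidf(float v) { return 1.f / (1.f + std::exp(-v)); }

// First step: the previous hidden state is implicitly zero (PreComputeKernel).
static inline float grnn_step0(float w_x, float wz_x) {
  float tilde = std::tanh(w_x);
  float z = sigmoidf(wz_x);
  return (1.f - z) * tilde;
}

// Subsequent steps (PostComputeKernel).
static inline float grnn_step(const GrnnGates& g, float h_prev) {
  float r = sigmoidf(g.wr_x + g.ur_h);
  float z = sigmoidf(g.wz_x + g.uz_h);
  float tilde = std::tanh(g.w_x + r * g.u_h);
  return z * h_prev + (1.f - z) * tilde;
}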
+ +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGrnnCompute + : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + using TargetW = TargetWrapper; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchGrnnCompute() = default; + + private: + std::shared_ptr idx_sorted_by_width_cpu; + std::unique_ptr> gemm_impl_; + void PrepareLayout(const Tensor* input); + void CopyBack(float* from, float* to, int step); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute_test.cc b/lite/kernels/cuda/search_grnn_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..08b96e1f1ecd57d10099b9566a5c0cd5e6e885d1 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_grnn, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchGrnnCompute kernel; + operators::SearchGrnnParam param; + + int num_input = 6; + int num_hidden = 6; + int num_batch = 3; + Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + Tensor x_cpu, wi_cpu, wh_cpu, out_cpu, layout_input_cpu, tmp_buffer_cpu; + x_cpu.Resize({num_batch, num_input}); + wi_cpu.Resize({3, num_hidden, num_input}); + wh_cpu.Resize({3, num_hidden, num_hidden}); + out_cpu.Resize({num_batch, num_hidden}); + layout_input_cpu.Resize({num_batch, num_input}); + tmp_buffer_cpu.Resize({20, num_batch, num_hidden}); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + auto* wi_cpu_data = wi_cpu.mutable_data(); + for (int i = 0; i < wi_cpu.numel(); ++i) { + wi_cpu_data[i] = static_cast(i); + } + auto* wh_cpu_data = wh_cpu.mutable_data(); + for (int i = 0; i < wh_cpu.numel(); ++i) { + wh_cpu_data[i] = static_cast(i); + } + + x.Assign(x_cpu_data, x_cpu.dims()); + wi.Assign(wi_cpu_data, wi_cpu.dims()); + wh.Assign(wh_cpu_data, wh_cpu.dims()); + + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + LOG(INFO) << "out_data:"; + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute.cu b/lite/kernels/cuda/search_group_padding_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..697e53dbb68b09bec6c32ece73723d469a5cd9d6 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_group_padding_compute.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +__global__ void ker_search_group_padding(Dtype* out_emb_padding_data, + Dtype* out_padding_data, + const Dtype* in_data, + const uint64_t* offset, + const int seq_num, + const int max_len, + const int emb_size, + const Dtype pad_id, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = word_id / max_len; + int word_id_in_seq = word_id % max_len; + int cur_len = offset[seq_id + 1] - offset[seq_id]; + if (word_id_in_seq < cur_len) { + out_emb_padding_data[tid] = + in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id]; + } else { + out_emb_padding_data[tid] = 0.f; + if (emb_id == 0) { + out_padding_data[word_id] = pad_id; + } + } + } +} + +void SearchGroupPaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + const Tensor* x = param.x; + Tensor* out_emb_padding = param.out_emb_padding; + Tensor* out_new = param.out_new; + Tensor* out_padding = param.out_padding; + const float pad_id = static_cast(param.pad_id); + const float* in_data = x->data(); + const auto& in_seq_offset = x->lod()[0]; + int batch = in_seq_offset.size() - 1; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (in_seq_offset[i + 1] - in_seq_offset[i] > max_seq) { + max_seq = in_seq_offset[i + 1] - in_seq_offset[i]; + } + } + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + std::vector x_dims = x->dims().Vectorize(); + LoD out_emb_padding_lod; + 
out_emb_padding_lod.push_back(new_offset); + out_emb_padding->set_lod(out_emb_padding_lod); + out_emb_padding->Resize({batch * max_seq, x_dims[1]}); + float* out_emb_padding_data = + out_emb_padding->mutable_data(TARGET(kCUDA)); + + LoD out_new_lod; + out_new_lod.push_back(in_seq_offset); + out_new->set_lod(out_new_lod); + out_new->Resize({x_dims[0], 1}); + float* out_new_data = out_new->mutable_data(TARGET(kCUDA)); + + LoD out_padding_lod; + out_padding_lod.push_back(new_offset); + out_padding->set_lod(out_padding_lod); + out_padding->Resize({batch * max_seq, 1}); + float* out_padding_data = out_padding->mutable_data(TARGET(kCUDA)); + + const int count = out_emb_padding->numel(); + const auto& out_emb_padding_seq_offset = out_emb_padding->lod()[0]; + int max_len = out_emb_padding_seq_offset[1]; + int seq_num = out_emb_padding_seq_offset.size() - 1; + int emb_size = x->dims()[1]; + _in_seq_offset.Resize({seq_num + 1, 1, 1, 1}); + uint64_t* offset_data = _in_seq_offset.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(offset_data, + in_seq_offset.data(), + sizeof(uint64_t) * in_seq_offset.size(), + IoDirection::HtoD, + cuda_stream); + + TargetWrapperCuda::MemsetSync( + out_new_data, 0, out_new->dims()[0] * out_new->dims()[1] * sizeof(float)); + TargetWrapperCuda::MemsetSync( + out_padding_data, + 0, + out_padding->dims()[0] * out_padding->dims()[1] * sizeof(float)); + + ker_search_group_padding< + float><<>>( + out_emb_padding_data, + out_padding_data, + in_data, + offset_data, + seq_num, + max_len, + emb_size, + pad_id, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_group_padding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGroupPaddingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_emb_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_new", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_group_padding_compute.h b/lite/kernels/cuda/search_group_padding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..88391e6d652b92571d11b321f12288155665d9da --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
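For reference, a host-side sketch of what ker_search_group_padding produces: each sequence is stretched to max_seq rows, real words are copied through, padded rows are zero-filled, and out_padding marks the padded positions with pad_id. Variable names are illustrative; the kernel computes the same mapping element-wise on the GPU.

// Reference only, mirroring the CUDA kernel's indexing.
#include <algorithm>
#include <cstdint>
#include <vector>

static void group_padding_ref(const std::vector<float>& x,          // [total_words * emb]
                              const std::vector<uint64_t>& offset,  // LoD, size = batch + 1
                              int emb, float pad_id,
                              std::vector<float>* out_emb,          // [batch * max_seq * emb]
                              std::vector<float>* out_padding) {    // [batch * max_seq]
  const int batch = static_cast<int>(offset.size()) - 1;
  int max_seq = 0;
  for (int i = 0; i < batch; ++i)
    max_seq = std::max<int>(max_seq, static_cast<int>(offset[i + 1] - offset[i]));
  out_emb->assign(static_cast<size_t>(batch) * max_seq * emb, 0.f);
  out_padding->assign(static_cast<size_t>(batch) * max_seq, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int len = static_cast<int>(offset[i + 1] - offset[i]);
    for (int w = 0; w < max_seq; ++w) {
      float* dst = out_emb->data() + (static_cast<size_t>(i) * max_seq + w) * emb;
      if (w < len) {
        const float* src = x.data() + (offset[i] + w) * emb;
        std::copy(src, src + emb, dst);      // real word: copy embedding through
      } else {
        (*out_padding)[static_cast<size_t>(i) * max_seq + w] = pad_id;  // padded slot
      }
    }
  }
}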
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override; + virtual ~SearchGroupPaddingCompute() = default; + + private: + lite::Tensor _in_seq_offset; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute_test.cc b/lite/kernels/cuda/search_group_padding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b831780c876dcc9d910cbf48a66bf0d1ec7a5bb2 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(search_group_padding_cuda, run_test) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, x_ref; + lite::Tensor out_emb_padding, out_emb_padding_cpu, out_emb_padding_ref; + lite::Tensor out_new, out_new_cpu, out_new_ref; + lite::Tensor out_padding, out_padding_cpu, out_padding_ref; + + int x_dims0 = 2; + int x_dims1 = 3; + + x.Resize({x_dims0, x_dims1}); + x_cpu.Resize({x_dims0, x_dims1}); + x_ref.Resize({x_dims0, x_dims1}); + out_emb_padding.Resize({1, x_dims1}); + out_emb_padding_cpu.Resize({1, x_dims1}); + out_emb_padding_ref.Resize({1, x_dims1}); + out_new.Resize({x_dims0, 1}); + out_new_cpu.Resize({x_dims0, 1}); + out_new_ref.Resize({x_dims0, 1}); + out_padding.Resize({1, 1}); + out_padding_cpu.Resize({1, 1}); + out_padding_ref.Resize({1, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* x_ref_data = x_ref.mutable_data(); + auto* out_emb_padding_data = + out_emb_padding.mutable_data(TARGET(kCUDA)); + auto* out_emb_padding_cpu_data = out_emb_padding_cpu.mutable_data(); + auto* out_emb_padding_ref_data = out_emb_padding_ref.mutable_data(); + auto* out_new_data = out_new.mutable_data(TARGET(kCUDA)); + auto* out_new_cpu_data = out_new_cpu.mutable_data(); + auto* out_new_ref_data = out_new_ref.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(TARGET(kCUDA)); + auto* out_padding_cpu_data = out_padding_cpu.mutable_data(); + auto* out_padding_ref_data = out_padding_ref.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = static_cast(i); + x_ref_data[i] = static_cast(i); + } + x.Assign(x_cpu_data, x_cpu.dims()); + out_emb_padding_ref_data[0] = 0.f; + out_emb_padding_ref_data[1] = 1.f; + out_emb_padding_ref_data[2] = 2.f; + out_new_ref_data[0] = 0.f; + out_new_ref_data[1] = 0.f; + 
out_padding_ref_data[0] = 0.f; + + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + sgp_kernel.SetContext(std::move(ctx)); + sgp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync(out_emb_padding_cpu_data, + out_emb_padding_data, + sizeof(float) * out_emb_padding.numel(), + IoDirection::DtoH); + CopySync(out_new_cpu_data, + out_new_data, + sizeof(float) * out_new.numel(), + IoDirection::DtoH); + CopySync(out_padding_cpu_data, + out_padding_data, + sizeof(float) * out_padding.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_emb_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_cpu_data[i], out_emb_padding_ref_data[i], 1e-5); + } + for (int i = 0; i < out_new_cpu.dims().production(); i++) { + EXPECT_NEAR(out_new_cpu_data[i], out_new_ref_data[i], 1e-5); + } + for (int i = 0; i < out_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_padding_cpu_data[i], out_padding_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kCUDA, kFloat, kNCHW, def); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.cu b/lite/kernels/cuda/search_seq_depadding_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..ecadceab582ccebf765ef43edda49ed414354611 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.cu @@ -0,0 +1,115 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_depadding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_sequence_depadding_fwd(Dtype* out_data, + const Dtype* in_data, + const int* seq_id_map, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = seq_id_map[word_id]; + out_data[tid] = in_data[seq_id * emb_size + emb_id]; + } +} + +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + auto* in_data = pad->data(); + out->Resize({src->dims()[0], pad->dims()[1]}); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + const int count = out->numel(); + + const auto& pad_seq_offset = pad->lod()[0]; + const auto& src_seq_offset = src->lod()[0]; + int max_len = pad_seq_offset[1]; + int seq_num = pad_seq_offset.size() - 1; + int emb_size = pad->dims()[1]; + + LoD out_lod; + out_lod.push_back(src_seq_offset); + out->set_lod(out_lod); + std::vector seq_id_map; + for (int i = 0; i < seq_num; i++) { + int cur_len = src_seq_offset[i + 1] - src_seq_offset[i]; + for (int j = 0; j < cur_len; j++) { + seq_id_map.push_back(i * max_len + j); + } + } + + int map_size = seq_id_map.size(); + seq_id_map_tensor.Resize({map_size, 1, 1, 1}); + int* seq_id_map_data = seq_id_map_tensor.mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(seq_id_map_data, + &seq_id_map[0], + seq_id_map.size() * sizeof(int), + IoDirection::HtoD, + cuda_stream); + + int threads = 512; + int blocks = (count + threads - 1) / threads; + ker_sequence_depadding_fwd<<>>( + out_data, in_data, seq_id_map_data, seq_num, max_len, emb_size, count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_depadding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqDepaddingCompute, + def) + .BindInput("Src", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Pad", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.h b/lite/kernels/cuda/search_seq_depadding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..a06f39bee2d9078206ab05f7f5377a5598498620 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + using TargetW = TargetWrapper; + + void Run() override; + virtual ~SearchSeqDepaddingCompute() = default; + + private: + Tensor seq_id_map_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_depadding_compute_test.cc b/lite/kernels/cuda/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c23ff14ab7a27b53177b2d0e48710df55c59ae5 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
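+
+// Smoke test for the CUDA search_seq_depadding kernel: rows of the padded
+// input ("Pad") are gathered back into the compact layout described by the
+// "Src" LoD and compared against hand-computed expectations.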
+ +#include "lite/kernels/cuda/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_seq_depadding, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchSeqDepaddingCompute kernel; + operators::SearchSeqDepaddingParam param; + + Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + Tensor pad_cpu, src_cpu, out_cpu; + pad_cpu.Resize({2 * 3, 4}); + src_cpu.Resize({3, 1}); + out_cpu.Resize({3, 4}); + + auto* pad_cpu_data = pad_cpu.mutable_data(); + auto* src_cpu_data = src_cpu.mutable_data(); + for (int i = 0; i < pad_cpu.numel(); ++i) { + pad_cpu_data[i] = static_cast(i); + } + + pad.Assign(pad_cpu_data, pad_cpu.dims()); + src.Assign(src_cpu_data, src_cpu.dims()); + + param.pad = &pad; + param.src = &src; + param.out = &out; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + // LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute.cu b/lite/kernels/cuda/search_seq_fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..e3ac75afeeee772ed7486a47dde14b7a3af4085f --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_fc_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void add_bias(int n, + int output_size, + const dtype* bias, + dtype* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +void SearchSeqFcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchSeqFcCompute::Run() { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + auto cuda_stream = cuda_ctx.exec_stream(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + CHECK(gemm_impl_->init(false, true, M, N, K, &cuda_ctx)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &cuda_ctx); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->mutable_data(); + int total_size = M * N; + add_bias<<>>(total_size, N, b_data, out_data); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_fc_compute.h b/lite/kernels/cuda/search_seq_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..dff8ba2acfbe28fc72f095294ad5a140ed66f150 --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchSeqFcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute_test.cc b/lite/kernels/cuda/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..354d1bb5bc3b0f3ee4d102fb2ebce176041ba91b --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_seq_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_seq_fc_compute_ref(const operators::SearchSeqFcParam& param) { + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T xv = x_data[i * K + l]; + T wv = w_data[j * K + l]; + sum += xv * wv; + } + out_data[i * N + j] = sum; + } + } + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->data(); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } +} + +TEST(search_seq_fc_compute, normal) { + Env::Init(); + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + 
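+        // Sweep bias on/off for every (x_lod, feature_size, out_size) combo.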
for (auto has_bias : {true, false}) { + // infer x_dims, w_dims, b_dims and out_dims + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + DDim out_dims({static_cast(x_lod_0.back()), out_size}); + LoD x_lod; + x_lod.push_back(x_lod_0); + LoD out_lod; + out_lod.push_back(x_lod_0); + // prepare input&output tensors + Tensor x_dev, x_host, w_dev, w_host, b_dev, b_host, out_dev, out_host, + out_ref; + x_host.Resize(x_dims); + w_host.Resize(w_dims); + b_host.Resize(b_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + w_dev.Resize(w_dims); + b_dev.Resize(b_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto w_host_data = w_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < w_host.dims().production(); i++) { + w_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + w_dev.Assign(w_host_data, + w_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::SearchSeqFcParam param; + param.x = &x_dev; + param.w = &w_dev; + param.out = &out_dev; + param.out_size = out_size; + if (has_bias) { + auto b_host_data = b_host.mutable_data(); + for (int i = 0; i < b_host.dims().production(); i++) { + b_host_data[i] = i * 0.5f; + } + b_dev.Assign(b_host_data, + b_host.dims()); + param.b = &b_dev; + } + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchSeqFcCompute search_seq_fc; + search_seq_fc.SetParam(param); + search_seq_fc.SetContext(std::move(ctx)); + search_seq_fc.Launch(); + cudaDeviceSynchronize(); + CopySync(out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.x = &x_host; + param.w = &w_host; + param.out = &out_ref; + if (has_bias) { + param.b = &b_host; + } + search_seq_fc_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.cu b/lite/kernels/cuda/sequence_arithmetic_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..7593632a14acd0cbec548dc5b9d3a096c4c7f38d --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.cu @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void ker_arithmetic_sum(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] + + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_sub(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] - + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_mul(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] * + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +void SequenceArithmeticCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_data = param.X->data(); + auto x_lod = param.X->lod()[0]; + auto y_data = param.Y->data(); + auto y_lod = param.Y->lod()[0]; + auto out_data = param.Out->mutable_data(TARGET(kCUDA)); + + offset_x.Resize({static_cast(x_lod.size())}); + auto offset_x_data = offset_x.mutable_data(TARGET(kCUDA)); + + offset_y.Resize({static_cast(y_lod.size())}); + auto offset_y_data = offset_y.mutable_data(TARGET(kCUDA)); + + word_id_to_seq_id.Resize({param.X->numel()}); + auto word_id_to_seq_id_data = + word_id_to_seq_id.mutable_data(TARGET(kCUDA)); + + std::vector word_seq_map; + for (int i = 0; i < x_lod.size() 
- 1; i++) { + for (int j = x_lod[i]; j < x_lod[i + 1]; j++) { + word_seq_map.push_back(i); + } + } + + std::vector offset_x_data_cpu(x_lod.size(), 0); + auto x_lod_data = x_lod.data(); + for (int i = 0; i < offset_x_data_cpu.size(); i++) { + offset_x_data_cpu[i] = x_lod_data[i]; + } + + std::vector offset_y_data_cpu(y_lod.size(), 0); + auto y_lod_data = y_lod.data(); + for (int i = 0; i < offset_y_data_cpu.size(); i++) { + offset_y_data_cpu[i] = y_lod_data[i]; + } + + TargetWrapperCuda::MemcpyAsync(offset_x_data, + offset_x_data_cpu.data(), + sizeof(int) * x_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(offset_y_data, + offset_y_data_cpu.data(), + sizeof(int) * y_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(word_id_to_seq_id_data, + word_seq_map.data(), + sizeof(int) * word_seq_map.size(), + IoDirection::HtoD, + stream); + + int seq_num = x_lod.size() - 1; + int count = param.X->numel(); + int inner_size = param.X->dims()[1]; + switch (param.op_type) { + case 1: // sum + ker_arithmetic_sum< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 2: // sub + ker_arithmetic_sub< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 3: // mul + ker_arithmetic_mul< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + default: + break; + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.h b/lite/kernels/cuda/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..a180c50eaa810511f8d72902e81bcd9abdaca31e --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
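+//
+// SequenceArithmeticCompute: element-wise add / sub / mul of two LoD tensors,
+// aligned sequence by sequence (op_type 1 = sum, 2 = sub, 3 = mul). Positions
+// past the end of the shorter Y sequence copy X through unchanged, e.g. with
+// inner_size = 1: X lod {0, 3}, X = [1 2 3], Y lod {0, 2}, Y = [10 20]
+// gives Out = [11 22 3] for op_type = 1.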
+ +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; + virtual ~SequenceArithmeticCompute() = default; + + private: + lite::Tensor offset_x; + lite::Tensor offset_y; + lite::Tensor word_id_to_seq_id; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute_test.cc b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0746d375d5c43d68cfad1896e7a3ab6178e2c35 --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, out_cpu, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + + prepare_input(&x_cpu, x_lod); + prepare_input(&y_cpu, y_lod); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + auto y_cpu_data = y_cpu.mutable_data(); 
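+  // Upload Y's host buffer to the device tensor, mirroring what was done for X.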
+ y.Assign(y_cpu_data, y_cpu.dims()); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + SequenceArithmeticCompute sequence_arithmetic; + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + sequence_arithmetic_compute_ref(x_cpu, y_cpu, &out_ref, param.op_type); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute.cu b/lite/kernels/cuda/sequence_concat_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..d4390046b01d6411bc8528e86083d5059eb4d449 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.cu @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_concat_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +inline LoD ConcatLoD(const std::vector& xs) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +__global__ void ker_sequence_concat(Dtype* out_data, + const uint64_t* in_locate_data, + const int* o2i_map, + const int* o2i_w_map, + const int seq_num, + const int emb_size, + const int count) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int input_id = o2i_map[word_id]; + int cur_work_id = o2i_w_map[word_id]; + const Dtype* in_data = reinterpret_cast( + reinterpret_cast(in_locate_data[input_id])); + out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; + } +} + +void SequenceConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + + int seq_num = param.X[0]->lod()[0].size() - 1; + const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; + std::vector in_locate_vec; + for (size_t i = 0; i < param.X.size(); ++i) { + in_locate_vec.push_back( + reinterpret_cast(param.X[i]->data())); + } + in_locate_tensor.Resize({static_cast(in_locate_vec.size())}); + + std::vector out2in_map; + std::vector out2in_word_map; + for (int i = 0; i < seq_num; ++i) { + for (int j = 0; j < param.X.size(); ++j) { + auto offset = param.X[j]->lod()[0]; + int cur_len = offset[i + 1] - offset[i]; + for (int k = 0; k < cur_len; ++k) { + out2in_map.push_back(j); + out2in_word_map.push_back(offset[i] + k); + } + } + } + int word_num = out2in_map.size(); + out2in_map_tensor.Resize({word_num}); + out2in_word_map_tensor.Resize({word_num}); + int* gpu_o2i_map_data = out2in_map_tensor.mutable_data(TARGET(kCUDA)); + int* gpu_o2i_w_map_data = + out2in_word_map_tensor.mutable_data(TARGET(kCUDA)); + uint64_t* gpu_in_locate_data = + in_locate_tensor.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, + out2in_map.data(), + sizeof(int) * out2in_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, + out2in_word_map.data(), + sizeof(int) * out2in_word_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, + in_locate_vec.data(), + sizeof(uint64_t) * in_locate_vec.size(), + IoDirection::HtoD, + stream); + + param.Out->set_lod(ConcatLoD(param.X)); + + int count = param.X[0]->numel(); + for (int i = 1; i < param.X.size(); ++i) { + count += param.X[i]->numel(); + } + + int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + ker_sequence_concat<<>>( + out_data, + gpu_in_locate_data, + gpu_o2i_map_data, + gpu_o2i_w_map_data, + seq_num, + emb_size, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kCUDA, 
+ kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_concat_compute.h b/lite/kernels/cuda/sequence_concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1737c18dd35976572efa1b62fadefed906b0ceb5 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override; + virtual ~SequenceConcatCompute() = default; + + private: + lite::Tensor out2in_map_tensor; + lite::Tensor out2in_word_map_tensor; + lite::Tensor in_locate_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute_test.cc b/lite/kernels/cuda/sequence_concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..477dc48dbbdfe7a1453bbb5c811d6897347fee53 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
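+
+// Test for the CUDA sequence_concat kernel: three LoD inputs are interleaved
+// sequence by sequence and the result is checked against the CPU reference
+// implemented in sequence_concat_ref() below.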
+ +#include "lite/kernels/cuda/sequence_concat_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT_DATA(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name##_cpu.Resize({name##_lod_len, feature_len}); \ + name##_ref.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + name##_cpu.set_lod(lod_info_##name); \ + name##_ref.set_lod(lod_info_##name); \ + float* name##_cpu_data = name##_cpu.mutable_data(); \ + float* name##_ref_data = name##_ref.mutable_data(); \ + for (int i = 0; i < name##_cpu.numel(); ++i) { \ + name##_cpu_data[i] = (i - 2.0) * 1.0; \ + name##_ref_data[i] = (i - 2.0) * 1.0; \ + } \ + name.Assign(name##_cpu_data, \ + name##_cpu.dims()); + +#define PREPARE_OUTPUT_INFO(name) \ + name##_cpu.Resize({y_lod_len, feature_len}); \ + name##_ref.Resize({y_lod_len, feature_len}); \ + name.Resize({y_lod_len, feature_len}); \ + float* name##_cpu_data = name##_cpu.mutable_data(); + +} // namespace + +TEST(sequence_concat_cuda, normal) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3, x1_cpu, x2_cpu, x3_cpu, x1_ref, x2_ref, x3_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT_DATA(x1); + PREPARE_INPUT_DATA(x2); + PREPARE_INPUT_DATA(x3); + PREPARE_OUTPUT_INFO(y); + + param.X = std::vector({&x1, &x2, &x3}); + param.Out = &y; + seq_kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + std::vector input_ref({&x1_ref, &x2_ref, &x3_ref}); + sequence_concat_ref(input_ref, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute.cu b/lite/kernels/cuda/sequence_pool_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..97876ec32fcc3ffc3d45ff8dbeafca90d6191b23 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void seq_pool_average_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum / in_slice_num; + } +} + +template +__global__ void seq_pool_sum_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum; + } +} + +template +__global__ void seq_pool_sqrt_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int 
out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum * rsqrtf(in_slice_num); + } +} + +template +__global__ void seq_pool_max_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype max = src_in[0]; + for (int i = 1; i < in_slice_num; ++i) { + Dtype val = src_in[i * slice_size]; + if (val > max) { + max = val; + } + } + dst[out_batch_id * slice_size + out_id] = max; + } +} + +template +__global__ void seq_pool_last_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = + (static_cast(seq_offset[out_batch_id + 1]) - 1) * slice_size; + dst[tid] = src_in[in_offset + out_id]; + } +} + +template +__global__ void seq_pool_first_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + dst[tid] = src_in[in_offset + out_id]; + } +} + +void SequencePoolCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + std::vector seq_offset = param.X->lod()[0]; + int batch_size = param.X->lod()[0].size() - 1; + int slice_size = param.Out->dims().production() / batch_size; + + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + const float* in_data = param.X->data(); + + lite::Tensor seq_offset_D; + seq_offset_D.Resize({static_cast(seq_offset.size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offset_D.mutable_data(TARGET(kCUDA)), + seq_offset.data(), + sizeof(uint64_t) * seq_offset.size(), + IoDirection::HtoD, + stream); + + if (param.pool_type == "MAX") { + seq_pool_max_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "AVERAGE") { + seq_pool_average_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SUM") { + seq_pool_sum_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SQRT") { + seq_pool_sqrt_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "FIRST") { + seq_pool_first_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "LAST") { + seq_pool_last_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else { + 
LOG(ERROR) << "pool type " << param.pool_type << " is not supoorted."; + } + + std::vector offset_new(static_cast(batch_size + 1)); + + for (int i = 0; i <= batch_size; ++i) { + offset_new[i] = i; + } + std::vector> voffset_new; + voffset_new.push_back(offset_new); + param.Out->set_lod(voffset_new); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pool_compute.h b/lite/kernels/cuda/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9309454d18d014045ac3bc7f189d2d8430949033 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void Run() override; + virtual ~SequencePoolCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute_test.cc b/lite/kernels/cuda/sequence_pool_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f2656cd1d6c4baa377d8f1d363ae5150113d42f --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_pool_compute.h" +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(sequence_pool_cuda, normal) { + SequencePoolCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, out, out_cpu; + lite::LoD lod; + lod.push_back(std::vector{0, 10}); + + x.set_lod(lod); + x_cpu.set_lod(lod); + const size_t second_dim = 8u; + std::vector input_shape{static_cast(lod[0].back()), + static_cast(second_dim)}; + lite::DDim in_dims(input_shape); + x.Resize(in_dims); + x_cpu.Resize(in_dims); + + const size_t out_first_dim = lod[0].size() - 1; + std::vector output_shape{static_cast(out_first_dim), + static_cast(second_dim)}; + lite::DDim out_dims(output_shape); + out.Resize(out_dims); + out_cpu.Resize(out_dims); + + auto x_cpu_data = x_cpu.mutable_data(); + auto out_data = out.mutable_data(TARGET(kCUDA)); + auto out_cpu_data = out_cpu.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = 1.1f * i; + } + x.Assign(x_cpu_data, x_cpu.dims()); + + operators::SequencePoolParam param; + param.X = &x; + param.Out = &out; + std::vector pool_types( + {"MAX", "AVERAGE", "SUM", "SQRT", "FIRST", "LAST"}); + std::map> type_map; + type_map["MAX"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + type_map["AVERAGE"] = {39.6, 40.7, 41.8, 42.9, 44, 45.1, 46.2, 47.3}; + type_map["SUM"] = {396, 407, 418, 429, 440, 451, 462, 473}; + type_map["SQRT"] = { + 125.226, 128.705, 132.183, 135.662, 139.14, 142.619, 146.097, 149.576}; + type_map["FIRST"] = {0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7}; + type_map["LAST"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + for (std::string pool_type : pool_types) { + param.pool_type = pool_type; + seq_kernel.SetParam(param); + + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync(out_cpu_data, + out_data, + sizeof(float) * out_cpu.numel(), + IoDirection::DtoH); + + std::vector ref_results = type_map[pool_type]; + + for (int i = 0; i < out_cpu.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-3); + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute.cu b/lite/kernels/cuda/sequence_reverse_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..68447fcebb1a6189f3a80d47ea29b0fca88267c8 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_reverse_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__host__ __device__ inline size_t UpperBound(const T* x, + size_t num, + const T& val) { + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/upper_bound + auto* first = x; + int64_t count = static_cast(num); + while (count > 0) { + auto step = (count >> 1); + auto* it = first + step; + if (val < *it) { + count = step; + } else { + first = ++it; + count -= (step + 1); + } + } + return static_cast(first - x); +} + +template +__global__ void SequenceReverseKernelGridIsOne( + const T* x, T* y, const int64_t* lod, size_t lod_count, int64_t row_numel) { + int64_t idx = static_cast(threadIdx.x); + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; +} + +template +__global__ void SequenceReverseKernel(const T* x, + T* y, + const int64_t* lod, + size_t lod_count, + int64_t row_numel, + size_t limit) { + int64_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; + } +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + size_t limit = static_cast(param.X->numel()); + int64_t row_numel = static_cast(limit / param.X->dims()[0]); + const auto* x_data = param.X->template data(); + auto y_data = param.Out->template mutable_data(TARGET(kCUDA)); + CHECK_NE(x_data, y_data) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + param.Out->set_lod(param.X->lod()); + + lod_cuda.Resize({static_cast(lod.size())}); + int64_t* lod_data = lod_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(lod_data, + lod.data(), + sizeof(int64_t) * lod.size(), + IoDirection::HtoD, + stream); + constexpr int num_threads = 1024; + int block_size = limit <= num_threads ? 
limit : num_threads; + int grid_size = (limit + num_threads - 1) / num_threads; + if (grid_size == 1) { + SequenceReverseKernelGridIsOne<<<1, block_size, 0, stream>>>( + x_data, y_data, lod_data, lod_count, row_numel); + } else { + SequenceReverseKernel<<>>( + x_data, y_data, lod_data, lod_count, row_numel, limit); + } + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseFp32; + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_reverse_compute.h b/lite/kernels/cuda/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6b6199e020e64343632d3f7c90d2cbbae4eaa42b --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override; + virtual ~SequenceReverseCompute() = default; + + private: + lite::Tensor lod_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute_test.cc b/lite/kernels/cuda/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3317b523037d913d6017041fbd357ed1dcf2d20a --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
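The two sequence_reverse kernels above differ only in how the flat element index is obtained; both map a source row to its mirrored position inside the same LoD sequence via UpperBound. The same row mapping can be checked on the host with std::upper_bound; this is an illustrative sketch of the indexing only, not code from the patch:

#include <algorithm>
#include <cstdint>
#include <vector>

// Returns the destination row for `row`, given the sequence offsets `lod`
// (e.g. {0, 3, 5, 6, 10}, the last LoD level used in the test below).
int64_t reversed_row(const std::vector<int64_t>& lod, int64_t row) {
  // std::upper_bound plays the role of the device-side UpperBound().
  auto it = std::upper_bound(lod.begin(), lod.end(), row);
  int64_t seq_end = *it;          // lod[lod_idx]
  int64_t seq_begin = *(it - 1);  // lod[lod_idx - 1]
  return seq_begin + (seq_end - 1 - row);
}
// With lod = {0, 3, 5, 6, 10}: rows 0,1,2 map to 2,1,0 and rows 6..9 map to
// 9..6; the column offset within a row (idx % row_numel) is left untouched.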
+ +#include "lite/kernels/cuda/sequence_reverse_compute.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < static_cast(seq_offset.size()) - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} + +TEST(sequence_reverse_cuda, normal) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceReverseParam param; + lite::Tensor x, x_cpu, x_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_cpu.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_cpu.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_cpu.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_cpu.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = (i - 2.0) * 1.0; + x_ref_data[i] = (i - 2.0) * 1.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ea3edb30d86e314a04aab7ceac358e4c57b5b6a --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_topk_avg_pooling_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void topk_avg_pooling_kernel_by_row_improve( + Dtype *output_data, + const Dtype *input, + const int *gpu_input_offset_l, + const int *gpu_input_offset_r, + const int topk_size, + const int *topks, + const int feat_map_num) { + int row = + gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 + int col = gpu_input_offset_r[blockIdx.x + 1] - + gpu_input_offset_r[blockIdx.x]; // 30 + int max_k = topks[topk_size - 1]; + max_k = max_k < col ? max_k : col; + + extern __shared__ Dtype smem[]; // H*W + + const Dtype *fm_row_in_data = input; + for (int i = 0; i < blockIdx.x; ++i) { + int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; + int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i]; + fm_row_in_data += tmp_row * feat_map_num * tmp_col; + } + fm_row_in_data += blockIdx.y * row * col; + + for (int i = threadIdx.x; i < row * col; i += blockDim.x) { + smem[i] = fm_row_in_data[i]; + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < row; idx += blockDim.x) { + Dtype *fm_row_out_data = + output_data + + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + + blockIdx.y * topk_size; + + Dtype *smem_start_col = smem + idx * col; + + int counter = max_k; // topk_size; + Dtype last_max_val = -20000.0; + while (counter) { + Dtype max_val = -10000.0; + int max_pos = 0; + int m = 0; + for (; m < col; m++) { + Dtype cur_data = smem_start_col[m]; + if (cur_data > max_val) { + max_val = cur_data; + max_pos = m; + last_max_val = max_val; + } + } + if (max_val < -9999.0) { // == -10000.0 + max_val = last_max_val; + } + smem_start_col[max_pos] = -10000000.0; + int i = max_k - counter; + for (int c = 0; c < topk_size; c++) { + if (i <= topks[c] - 1) { + fm_row_out_data[c] += max_val; + } + } + counter--; + } + __syncthreads(); + // compute avg + for (int i = 0; i < topk_size; i++) { + fm_row_out_data[i] = fm_row_out_data[i] / topks[i]; + } + } +} + +template +void SequenceTopkAvgPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + int topk_num = param.topks.size(); + lite::DDim top_ks_shape(std::vector{topk_num, 1, 1, 1}); + _top_ks.Resize(top_ks_shape); + cudaMemcpyAsync(_top_ks.mutable_data(TARGET(kCUDA)), + ¶m.topks[0], + sizeof(int) * topk_num, + cudaMemcpyHostToDevice, + cuda_stream); + + int width_offset_len = param.COLUMN->lod()[0].size(); + lite::DDim width_offset_shape( + std::vector{width_offset_len, 1, 1, 1}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.COLUMN->lod()[0][i]); + } + cudaMemcpyAsync(_width_offset.mutable_data(TARGET(kCUDA)), + &width_lod_0[0], + sizeof(int) * width_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + int height_offset_len = param.ROW->lod()[0].size(); + lite::DDim height_offset_shape( + std::vector{height_offset_len, 1, 1, 1}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.ROW->lod()[0][i]); + } + cudaMemcpyAsync(_height_offset.mutable_data(TARGET(kCUDA)), + &height_lod_0[0], + sizeof(int) 
* height_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data(TARGET(kCUDA)), + 0, + sizeof(T) * out_tensor->numel(), + cuda_stream); + + int num = param.ROW->lod()[0].size() - 1; + int channel = param.channel_num; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = 0; + for (size_t i = 0; i < height_lod_0.size() - 1; ++i) { + int height = height_lod_0[i + 1] - height_lod_0[i]; + int width = width_lod_0[i + 1] - width_lod_0[i]; + if (height * width > feat_map_size) { + feat_map_size = height * width; + } + } + dim3 blocks(num, channel); + dim3 threads(32, 1); + topk_avg_pooling_kernel_by_row_improve< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + param.topks.size(), + _top_ks.data(), + param.channel_num); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ROW", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("pos", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..321ec9cfce2b22e7ddfc5dab53060a7eaea01732 --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
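Per row of each feature map, the kernel above greedily extracts the max_k largest values from shared memory and accumulates each of them into every top-k slot it belongs to, finally dividing slot c by topks[c]. On the host the same per-row result can be obtained with a sort; the following is an illustrative sketch under that reading of the kernel, not part of the patch:

#include <algorithm>
#include <functional>
#include <vector>

// `values` is one row of one feature map; `topks` is assumed sorted ascending,
// matching the kernel's max_k = min(topks.back(), row length).
std::vector<float> topk_avg_row(std::vector<float> values,
                                const std::vector<int>& topks) {
  std::sort(values.begin(), values.end(), std::greater<float>());
  std::vector<float> out(topks.size(), 0.f);
  for (size_t c = 0; c < topks.size(); ++c) {
    int k = std::min<int>(topks[c], static_cast<int>(values.size()));
    float sum = 0.f;
    for (int i = 0; i < k; ++i) sum += values[i];
    out[c] = sum / topks[c];  // note: divides by topks[c] even when k < topks[c]
  }
  return out;
}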
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override; + + virtual ~SequenceTopkAvgPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + lite::Tensor _top_ks; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/softmax_compute.cu b/lite/kernels/cuda/softmax_compute.cu index d8d2987524cd2e8f9c38aba4da3ff61a80bf53ce..6293f7295ec78f44705992182667b30f82728e09 100644 --- a/lite/kernels/cuda/softmax_compute.cu +++ b/lite/kernels/cuda/softmax_compute.cu @@ -173,9 +173,10 @@ void SoftmaxCompute::Run() { cudaGetDeviceProperties(&deviceProp, device_id); size_t sharedmem_size = deviceProp.sharedMemPerBlock; int max_dimsize = sharedmem_size / sizeof(float) / threads; - auto input_data = param.x->data(); auto output_data = param.output->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetSync( + output_data, 0, param.output->numel() * sizeof(float)); if (axis_size <= max_dimsize) { int use_sharemem_size = axis_size * threads * sizeof(float); sharemem_softmax_kernel<<>>( @@ -194,7 +195,7 @@ void SoftmaxCompute::Run() { auto max_data = tmax_data.mutable_data(TARGET(kCUDA)); auto sum_data = tsum_data.mutable_data(TARGET(kCUDA)); //! firstly, get maximum data - float min_data = std::numeric_limits::min(); + float min_data = std::numeric_limits::lowest(); softmax_max_kernel<<>>(total_threads, input_data, max_data, @@ -217,7 +218,7 @@ void SoftmaxCompute::Run() { total_threads, output_data, sum_data, inner_num, outer_num, axis_size); } cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); } } // namespace cuda @@ -244,3 +245,19 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SoftmaxCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2588a8f53b83363300000fca6ba8a11cf5d50b6 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -0,0 +1,263 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" +#include "lite/kernels/cuda/var_conv_2d_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +__global__ void var_im2col_gpu_kernel(const int n, + const Dtype* data_im, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int height_col, + const int width_col, + Dtype* data_col) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int index = idx; index < n; index += blockDim.x * gridDim.x) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + Dtype* data_col_ptr = data_col; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i; + int w_im = w_offset + j; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im_ptr[i * width + j] + : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { + auto& param = this->Param(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + const auto* input = param.X; + auto* col = param.Col; + + int batch = input->lod()[0].size() - 1; + const auto& bottom_offset = input->lod()[0]; + // 2-D lod info. 
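+  // (illustrative note, not in the original patch) X carries three LoD levels
+  // in this kernel: lod()[0] holds the per-sample offsets into the flattened
+  // data, lod()[1] the row (height) offsets and lod()[2] the column (width)
+  // offsets, which is why ROW and COLUMN are only kept as commented-out
+  // references below.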
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(TARGET(kCUDA)); + const auto* bottom_data = input->data(); + + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int width_col = (width - 1) / stride_w + 1; + int height_col = (height - 1) / stride_h + 1; + const float* data_im = bottom_data + b_offset; + float* data_col = top_data + t_offset; + + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int num_kernels = height_col * width_col * input_channel; + const int CUDA_NUM_BLOCKS = + (num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + var_im2col_gpu_kernel< + float><<>>( + num_kernels, + data_im, + height, + width, + kernel_h, + kernel_w, + ((stride_h - 1) * height + kernel_h - 1) / 2, + ((stride_w - 1) * width + kernel_w - 1) / 2, + stride_h, + stride_w, + height_col, + width_col, + data_col); + } +} + +void VarConv2DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + var_im2col(stream); + + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + std::vector height_vector; + std::vector width_vector; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + height_vector.push_back(top_im_y); + width_vector.push_back(top_im_x); + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + 
top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + + auto* top_data = top->mutable_data(TARGET(kCUDA)); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + std::unique_ptr> gemm_impl_; + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + float* out_data = top_data + top_offset[b]; + const float* in_data = col_data + col->lod()[0][b]; + gemm_impl_.reset(new lite::cuda::math::Gemm); + gemm_impl_->init(false, + false, + w->dims()[0], + height_vector[b] * width_vector[b], + input_channel * kernel_h * kernel_w, + &ctx); + gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx); + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.h b/lite/kernels/cuda/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e0b8e30c509f9095960bee3720567c96a71e7336 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Run() override; + virtual ~VarConv2DCompute() = default; + + private: + void var_im2col(const cudaStream_t& stream); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/var_conv_2d_compute_test.cc b/lite/kernels/cuda/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..98e9c73cdd680edc03cf18b60444bd5b0f76274c --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/var_conv_2d_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void naive_sgemm(const bool transpose_A, + const bool transpose_B, + const int M, + const int N, + const int K, + const float alpha, + const float* A, // m x k (after transpose if TransA) + const int lda, // leading dimension of a + const float* B, // k x n (after transpose if TransB) + const int ldb, // leading dimension of b + const float beta, + 
float* C, // m x n + const int ldc) { + for (int m = 0; m < M; ++m) { + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + C[m * N + n] += beta * C[m * N + n]; + size_t A_idx = 0, B_idx = 0; + if (transpose_A) { + A_idx = k * M + m; // A is k x m + } else { + A_idx = m * K + k; // A is m x k + } + + if (transpose_B) { + B_idx = n * K + k; // B is n x k + } else { + B_idx = k * N + n; // B is k x n + } + + C[m * N + n] += alpha * A[A_idx] * B[B_idx]; + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + naive_sgemm(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_cuda, normal) { + VarConv2DCompute var_conv_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor x_cpu, w_cpu; + lite::Tensor Out, Col, out_cpu, col_cpu; + int kernel_h = 5, kernel_w = 5; + int stride_h = 1, stride_w = 1; + int input_channel = 5, output_channel = 5; + + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + w_cpu.Resize(w_dims_vec); + auto* w_cpu_data = w_cpu.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_cpu_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + 
x_lod_vec.push_back(x_lod_vec.back() + height * width); + x_size += height * width; + } + for (size_t i = 0; i < x_lod_vec.size(); ++i) { + x_lod_vec[i] *= input_channel; + } + x_size *= input_channel; + std::vector<int64_t> x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + x_cpu.Resize(x_dims_vec); + X.set_lod(x_lod); + x_cpu.set_lod(x_lod); + auto* x_cpu_data = x_cpu.mutable_data<float>(); + for (int i = 0; i < X.numel(); ++i) { + x_cpu_data[i] = i % 20 * 1.f; + } + + int sum_num = 0; + int out_sum_num = 0; + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + sum_num += height * width * input_channel * kernel_h * kernel_w; + out_sum_num += height * width * output_channel; + } + col_cpu.Resize({sum_num, 1}); + out_cpu.Resize({out_sum_num, 1}); + float* out_cpu_data = out_cpu.mutable_data<float>(); + float* col_cpu_data = col_cpu.mutable_data<float>(); + + X.Assign(x_cpu_data, x_cpu.dims()); + W.Assign(w_cpu_data, w_cpu.dims()); + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + var_conv_kernel.SetContext(std::move(ctx)); + var_conv_kernel.Run(); + cudaDeviceSynchronize(); + + const float* out_data = Out.data<float>(); + const float* col_data = Col.data<float>(); + + CopySync<TARGET(kCUDA)>( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + CopySync<TARGET(kCUDA)>( + col_cpu_data, col_data, sizeof(float) * Col.numel(), IoDirection::DtoH); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&x_cpu, + &w_cpu, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], top_ref.data<float>()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(col_cpu_data[i], col_ref.data<float>()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index 3e06e103bba61937e48bb4d14eeedd493ab15bba..8bc171dd67df08c17cdce61c6fa6882afd9ae8ae 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -36,8 +36,15 @@ void ConvCompute::PrepareForRun() { conv_param.filter = param.filter->ZynqTensor(); conv_param.groups = param.groups; conv_param.strides = param.strides; + auto paddings = *param.paddings; conv_param.paddings = param.paddings; conv_param.dilations = param.dilations; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATAL) << "This pad not support ! 
" << paddings[0] << ", " << paddings[1] + << ", " << paddings[2] << ", " << paddings[3]; + } fill_scale_bias_const(&conv_param); conv_param.bias()->copyFrom(param.bias->ZynqTensor()); conv_param.relu.enabled = param.fuse_relu; diff --git a/lite/kernels/fpga/conv_compute_test.cc b/lite/kernels/fpga/conv_compute_test.cc index f166974cc9f2fd856defd753e1e9131858d41252..1e05c1fa0c7e0f211b5eaed8f5e0385cbfe20cf2 100644 --- a/lite/kernels/fpga/conv_compute_test.cc +++ b/lite/kernels/fpga/conv_compute_test.cc @@ -141,13 +141,15 @@ void conv_compute_ref(const operators::ConvParam& param) { int group = param.groups; int kernel_w = param.filter->dims()[2]; int kernel_h = param.filter->dims()[3]; + + auto paddings = *param.paddings; + auto dilations = *para.dilations; int stride_w = param.strides[0]; int stride_h = param.strides[1]; - int dila_w = param.dilations[0]; - int dila_h = param.dilations[1]; - - int pad_w = param.paddings[0]; - int pad_h = param.paddings[1]; + int dila_w = dilations[0]; + int dila_h = dilations[1]; + int pad_w = paddings[2]; + int pad_h = paddings[0]; bool flag_bias = (param.bias != nullptr); bool flag_relu = param.fuse_relu; @@ -277,10 +279,14 @@ TEST(conv_fpga, compute) { param.bias = &bias; } param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); + std::vector paddings = { + padding, padding, padding, padding}; param.strides = std::vector({stride, stride}); + std::vector dilations = {dilation, dilation}; + param.paddings = + std::make_shared>(paddings); param.dilations = - std::vector({dilation, dilation}); + std::make_shared>(dilations); param.groups = group; conv.SetParam(param); conv.Launch(); diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 032de819743f4aba02e442dd71c26b950d1435b6..79d1bf2fd5fa694d4888d474c321a43d279bab76 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -19,6 +19,9 @@ lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry @@ -39,6 +42,9 @@ set(npu_bridges npu_bridge_concat_op npu_bridge_shuffle_channel_op npu_bridge_pad2d_op + npu_bridge_square_op + npu_bridge_sqrt_op + npu_bridge_reduce_mean_op CACHE INTERNAL "npu_bridges") set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) @@ -60,5 +66,8 @@ lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc test_helper.cc DEPS lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_reduce_mean_op SRCS 
reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) message(STATUS "+++++ npu_bridges: ${npu_bridges}") diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 51b49091cd0e6f47fb9367e13aa7b2e43a6cf610..ac62891113b1899036c35ffd3058f1d409b00a36 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -41,6 +41,19 @@ node_map_type ActConverter(const std::shared_ptr act_op, // clipped_relu etc. act_node->set_attr_mode(lite::npu::CvtActMode(op_type)); + if (op_type == "relu_clipped") { + auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + act_node->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + act_node->set_attr_negative_slope(alpha); + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + act_node->set_attr_negative_slope(slope); + act_node->set_attr_coef(offset); + } + node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = act_node; return outputs_map; @@ -52,14 +65,18 @@ node_map_type ActConverter(const std::shared_ptr act_op, } // namespace lite } // namespace paddle -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(relu, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(tanh, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(relu_clipped, + paddle::lite::kernels::npu::bridges::ActConverter); +// REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(leaky_relu, + paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(abs, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softsign, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softplus, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, +REGISTER_NPU_BRIDGE(hard_sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); diff --git a/lite/kernels/npu/bridges/act_op_test.cc b/lite/kernels/npu/bridges/act_op_test.cc index 420de655dcdfb2069948399525bc4a8a561d0fd5..d50b1968b14cc33efd7ab9bcd0c4427d8ca2e508 100644 --- a/lite/kernels/npu/bridges/act_op_test.cc +++ b/lite/kernels/npu/bridges/act_op_test.cc @@ -17,7 +17,7 @@ #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/test_helper.h" -#include "lite/operators/relu_op.h" +#include "lite/operators/activation_ops.h" namespace paddle { namespace lite { @@ -25,69 +25,112 @@ namespace kernels { namespace npu { namespace bridges { -void relu_ref(const std::shared_ptr op) { +void act_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto op_type = op_info->Type(); + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); 
i++) { - out_data[i] = std::max(0.f, x_data[i]); + CHECK_EQ(x->numel(), out->numel()); + + // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid" + if (op_type == "sigmoid") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = 1.f / (1.f + std::exp(-x_data[i])); + } + } else if (op_type == "relu") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } + } else if (op_type == "tanh") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) / + (std::exp(x_data[i]) + std::exp(-x_data[i])); + } + } else if (op_type == "relu_clipped") { + auto relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef); + } + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(x_data[i], x_data[i] * alpha); + } + } else if (op_type == "softsign") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = x_data[i] / (1 + std::abs(x_data[i])); + } + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(1.f, slope * x_data[i] + offset); + out_data[i] = std::max(0.f, out_data[i]); + } + } else { + LOG(FATAL) << "unsupported activation type: " << op_type; } } -void test_relu(int bs, int ic, int ih, int iw) { +void test_act(std::vector x_shape, std::string op_type) { // prepare input&output variables Scope scope; std::string x_var_name("x"); std::string out_var_name("out"); std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(x_shape); // initialize input&output data - FillTensor(x); + FillTensor(x, -8, 8); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("relu"); + opdesc.SetType(op_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetOutput("Out", {out_var_name}); + if (op_type == "relu_clipped") { + opdesc.SetAttr("Relu_clipped_coef", 6.f); + } else if (op_type == "leaky_relu") { + opdesc.SetAttr("alpha", 0.02f); + } else if (op_type == "hard_sigmoid") { + opdesc.SetAttr("slope", 0.2f); + opdesc.SetAttr("offset", 0.5f); + } // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); + auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor - relu_ref(op); + act_ref(op); // compare results auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } +TEST(NPUBridges, activation) { + std::vector> shapes{{1}, 
{2, 3}, {1, 2, 3, 4}}; + std::vector types{"sigmoid", + "relu", + "tanh", + "relu_clipped", + "leaky_relu", + "softsign", + "hard_sigmoid"}; + for (auto x_shape : shapes) { + for (auto op_type : types) { + test_act(x_shape, op_type); } } } @@ -98,5 +141,20 @@ TEST(NPUBridges, relu) { } // namespace lite } // namespace paddle +USE_LITE_OP(sigmoid); +USE_NPU_BRIDGE(sigmoid); USE_LITE_OP(relu); USE_NPU_BRIDGE(relu); +USE_LITE_OP(tanh); +USE_NPU_BRIDGE(tanh); +USE_LITE_OP(relu_clipped); +USE_NPU_BRIDGE(relu_clipped); + +USE_LITE_OP(leaky_relu); +USE_NPU_BRIDGE(leaky_relu); + +USE_LITE_OP(softsign); +USE_NPU_BRIDGE(softsign); + +USE_LITE_OP(hard_sigmoid); +USE_NPU_BRIDGE(hard_sigmoid); diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 6f5f00959bd55faee2a76aa0bfbb9f12fa84c194..8c3153d242330360a2145ae87951dc8ea29168ca 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -30,8 +30,8 @@ node_map_type BatchNormConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); + std::shared_ptr batch_norm_node = + std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); auto scale_var_name = op_info->Input("Scale").front(); @@ -66,7 +66,7 @@ node_map_type BatchNormConverter( batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); + batch_norm_node->set_input_offset(*npu_bias); batch_norm_node->set_input_mean(*npu_mean); batch_norm_node->set_input_variance(*npu_variance); batch_norm_node->set_attr_momentum(npu_momentum); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 32f4d511d5d35a64a5e02a18a2b5ffa6d09d75cd..8dc9ab1f0f8a1e63c52b2406117fc34477e71490 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/conv_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -42,9 +43,9 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); CHECK_EQ(output_dims[0], bs); CHECK_EQ(output_dims[1], oc); auto strides = op_info->GetAttr>("strides"); @@ -52,9 +53,28 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); // check depthwise mode, and decide whether use ConvolutionDepthwise Op bool use_depthwise_conv = @@ -134,7 +154,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, depthwise_conv_node->set_attr_pad_mode(5); // VALID depthwise_conv_node->set_attr_group(groups); depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); depthwise_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); depthwise_conv_node->set_attr_stride( @@ -161,7 +181,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, common_conv_node->set_attr_pad_mode(0); // NOTSET common_conv_node->set_attr_group(groups); common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[0], paddings[2], paddings[2]})); common_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); common_conv_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_op_test.cc b/lite/kernels/npu/bridges/conv_op_test.cc index 26309aa9e27a1f0a5f6093b44242434d9e29a173..909061d2bae5f3330355c58f5dfe707a23c22075 100644 --- a/lite/kernels/npu/bridges/conv_op_test.cc +++ b/lite/kernels/npu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", 
groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ae99ef04670214c27f29b7ad30a637d614bea62..6eff4cb2d28d64098186dfb50a457a8828b8eb61 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -44,9 +44,17 @@ node_map_type ConvTransposeConverter( auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; // create deconv node auto conv_transpose_node = @@ -82,12 +90,11 @@ node_map_type ConvTransposeConverter( lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes - conv_transpose_node->set_attr_mode(1); conv_transpose_node->set_attr_format(0); // NCHW conv_transpose_node->set_attr_pad_mode(0); // NOTSET conv_transpose_node->set_attr_group(groups); conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); conv_transpose_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); conv_transpose_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_transpose_op_test.cc b/lite/kernels/npu/bridges/conv_transpose_op_test.cc index a009ef588e1ddf9561f895e977fbb08a98b2d51b..f96e57c06fc0fe1023edd591990fe4bd7ffc3ba5 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op_test.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op_test.cc @@ -278,7 +278,8 @@ void test_conv_transpose(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 2ec757ab14bf13eee323fa35df5ff592622ca4cf..5eb5f4e271df71b1fa29084f0787c004f4753ffc 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -21,6 +21,30 @@ namespace kernels { namespace npu { namespace bridges { +std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { + auto x_dims = x.dims(); + CHECK_EQ(x_dims.size(), 4UL) << "[NPU] only support 4-dimension x"; + auto y_dims = y->dims(); + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + node_map_type ElementwiseConverter( const std::shared_ptr 
elementwise_op, const node_map_type& inputs_map) { @@ -30,34 +54,53 @@ node_map_type ElementwiseConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr elementwise_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "[NPU] elementwise only support inputs with same size"; - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + auto axis = op_info->GetAttr("axis"); + std::shared_ptr elementwise_node = nullptr; + std::shared_ptr x_node = inputs_map.at(x_var_name); + std::shared_ptr y_node = nullptr; if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + y_node = inputs_map.at(y_var_name); } else { auto y_const_node = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - y_const_node->set_attr_value(lite::npu::CvtTensor(y)); - elementwise_node->set_input_x2(*y_const_node); - lite::npu::OpList::Global().add(y_const_node); + auto x = scope->FindTensor(x_var_name); + auto y = scope->FindMutableTensor(y_var_name); + auto y_new_shape = CvtYShape(*x, y, axis); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, y_new_shape)); + y_node = y_const_node; } + lite::npu::OpList::Global().add(x_node); + lite::npu::OpList::Global().add(y_node); - lite::npu::OpList::Global().add(elementwise_node); + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_sub") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_mul") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x(*x_node); + elt_node->set_input_y(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_div") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else { + LOG(FATAL) << "unsupported op type: " << op_type; + } - // paddlelite has sum only - elementwise_node->set_attr_mode(1); + lite::npu::OpList::Global().add(elementwise_node); node_map_type outputs_map; if (op_type == "fusion_elementwise_add_activation") { @@ -86,3 +129,9 @@ REGISTER_NPU_BRIDGE(elementwise_add, paddle::lite::kernels::npu::bridges::ElementwiseConverter); REGISTER_NPU_BRIDGE(fusion_elementwise_add_activation, paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_sub, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_mul, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_div, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops_test.cc b/lite/kernels/npu/bridges/elementwise_ops_test.cc index 0e2fc9f2622d839c8eda6f82aab2759053b3e23d..8dd4c851ca89413d3e740bb5bc5d0461938a7f69 100644 --- 
a/lite/kernels/npu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/npu/bridges/elementwise_ops_test.cc @@ -29,37 +29,28 @@ template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto y_data = y->data(); - dtype* out_data = out->mutable_data(); + auto out_data = out->mutable_data(); auto x_dims = x->dims(); auto y_dims = y->dims(); int axis = op_info->GetAttr("axis"); if (axis < 0) { - axis = x_dims.size() - y_dims.size(); + axis += x_dims.size(); } int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } + int channels = y->numel(); + int num = x->numel() / channels / batch; // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { + std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -73,7 +64,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "sub") { + } else if (op_type == "elementwise_sub") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -87,7 +78,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "mul") { + } else if (op_type == "elementwise_mul") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -101,7 +92,21 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "max") { + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -116,11 +121,14 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + LOG(FATAL) << "unsupported Elementwise type: " << op_type; } } -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { // prepare input&output variables Scope scope; std::string x_var_name = "x"; @@ -131,16 +139,16 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* y = scope.Var(y_var_name)->GetMutable(); auto* out = scope.Var(out_var_name)->GetMutable(); auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); + x->Resize(x_shape); + 
y->Resize(y_shape); // initialize input&output data - FillTensor(x); - FillTensor(y); + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); + opdesc.SetType("elementwise_" + elt_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetInput("Y", {y_var_name}); opdesc.SetOutput("Out", {out_var_name}); @@ -149,7 +157,6 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor elementwise_add_ref(op); @@ -158,19 +165,15 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } + for (auto elt_type : {"add", "sub", "mul", "div"}) { + test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); } } @@ -182,3 +185,9 @@ TEST(NPUBridges, elementwise_add) { USE_LITE_OP(elementwise_add); USE_NPU_BRIDGE(elementwise_add); +USE_LITE_OP(elementwise_sub); +USE_NPU_BRIDGE(elementwise_sub); +USE_LITE_OP(elementwise_mul); +USE_NPU_BRIDGE(elementwise_mul); +USE_LITE_OP(elementwise_div); +USE_NPU_BRIDGE(elementwise_div); diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 71f5eac57aa007b60ad574034b145b89b2e3095d..8e60a39fe4a32e8750cc161d3485314b42e1ab0c 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -45,6 +45,7 @@ node_map_type InterpolateConverter( auto out_h = op_info->GetAttr("out_h"); auto align_corners = op_info->GetAttr("align_corners"); int align_mode = op_info->GetAttr("align_mode"); + auto interp_method = op_info->GetAttr("interp_method"); CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && " "align_corners = false isn't " "supported in HiAI DDK"; @@ -58,11 +59,11 @@ node_map_type InterpolateConverter( } // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; + std::shared_ptr out_size_node = nullptr; if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; + out_size_node = inputs_map.at(out_size_var_name); } else { auto out_size = scope->FindVar(out_size_var_name)->GetMutable(); @@ -73,58 +74,45 @@ node_map_type InterpolateConverter( out_w = out_size_data[1]; } } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - 
lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { + if (out_size_node == nullptr) { + if (interp_method == "bilinear") { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); CHECK_LT(multiple, largest_multiple) << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + auto out_size_const_node = + std::make_shared(unique_op_type + "/out_size"); + out_size_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); + out_size_node = out_size_const_node; + } + lite::npu::OpList::Global().add(out_size_node); + + std::shared_ptr interp_node = nullptr; + if (interp_method == "bilinear") { + auto bilinear_interp_node = + std::make_shared(unique_op_type); + bilinear_interp_node->set_input_x(*inputs_map.at(x_var_name)); + bilinear_interp_node->set_input_size(*out_size_node); + bilinear_interp_node->set_attr_align_corners(align_corners); + interp_node = bilinear_interp_node; } else if (interp_method == "nearest") { - auto interp_node = + auto nearest_interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + nearest_interp_node->set_input_image(*inputs_map.at(x_var_name)); + nearest_interp_node->set_input_size(*out_size_node); + nearest_interp_node->set_attr_align_corners(align_corners); + interp_node = nearest_interp_node; } else { LOG(FATAL) << "[NPU] Unsupported interpolate method: " << interp_method; } + lite::npu::OpList::Global().add(interp_node); + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = interp_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index 5f8bdc4ee955a15ca4795e9f2554182696f656f2..2313351f6c49ea08451b06dc347c91aeeed4d755 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -31,82 +31,67 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - auto output_node = std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto y = scope->FindVar(y_var_name)->GetMutable(); + auto 
x_dims = x->dims(); + auto y_dims = y->dims(); int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - .Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "[NPU] x_w must be equal with y_h"; - int k = x_w; + int m = x_dims.Slice(0, x_num_col_dims).production(); + int k = x_dims.Slice(x_num_col_dims, x_dims.size()).production(); + CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production()) + << "[NPU] columns of X must be equal with rows of Y"; + int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; LOG(INFO) << "x_var_name:" << x_var_name << ", is data: " << inputs_map.count(x_var_name); LOG(INFO) << "y_var_name:" << y_var_name << ", is data: " << inputs_map.count(y_var_name); CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; + << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; + + auto mul_node = std::make_shared(unique_op_type); + // add input x node which supports persistable and non-persistable tensor, and + // reshape to (m, k) if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - lite::npu::OpList::Global().add(xsrc); - lite::npu::OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k}); + reshaped_x_node->set_attr_axis(0); + mul_node->set_input_x1(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - lite::npu::OpList::Global().add(constx); - output_node->set_input_x(*constx); + auto x_const_node = std::make_shared(x_var_name); + x_const_node->set_attr_value(lite::npu::CvtTensor(x, {m, k})); + mul_node->set_input_x1(*x_const_node); + lite::npu::OpList::Global().add(x_const_node); } - + // add input y node which only supports persistable tensor, and reshape to (k, + // n) if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - lite::npu::OpList::Global().add(ysrc); - lite::npu::OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); + auto reshaped_y_node = + std::make_shared(y_var_name + "_reshape"); + 
reshaped_y_node->set_input_tensor(*inputs_map.at(y_var_name)); + reshaped_y_node->set_attr_shape({k, n}); + reshaped_y_node->set_attr_axis(0); + mul_node->set_input_x2(*reshaped_y_node); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(reshaped_y_node); } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - lite::npu::OpList::Global().add(consty); - output_node->set_input_w(*consty); + auto y_const_node = std::make_shared(y_var_name); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, {k, n})); + mul_node->set_input_x2(*y_const_node); + lite::npu::OpList::Global().add(y_const_node); } - lite::npu::OpList::Global().add(output_node); + lite::npu::OpList::Global().add(mul_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = mul_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h index 8b4252de06e8934affe7592fc8ea521ad7d20025..9a432d17e543bece48fb1c1369ee90ff56e8dcbf 100644 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h @@ -16,23 +16,40 @@ #include "lite/kernels/npu/bridges/registry.h" -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(sigmoid); +USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(tanh); +USE_NPU_BRIDGE(relu_clipped); +USE_NPU_BRIDGE(leaky_relu); +USE_NPU_BRIDGE(softsign); +USE_NPU_BRIDGE(hard_sigmoid); + +USE_NPU_BRIDGE(batch_norm); +USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(conv2d); USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(conv2d_transpose); + USE_NPU_BRIDGE(elementwise_add); USE_NPU_BRIDGE(fusion_elementwise_add_activation); +USE_NPU_BRIDGE(elementwise_sub); +USE_NPU_BRIDGE(elementwise_mul); +USE_NPU_BRIDGE(elementwise_div); + +USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(bilinear_interp); +USE_NPU_BRIDGE(nearest_interp); +USE_NPU_BRIDGE(mul); +USE_NPU_BRIDGE(pad2d); +USE_NPU_BRIDGE(pool2d); +USE_NPU_BRIDGE(reduce_mean); +USE_NPU_BRIDGE(reshape); +USE_NPU_BRIDGE(reshape2); USE_NPU_BRIDGE(scale); +USE_NPU_BRIDGE(shuffle_channel); USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(split); +USE_NPU_BRIDGE(sqrt); +USE_NPU_BRIDGE(square); USE_NPU_BRIDGE(transpose); USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 5915b7a8aadfec38c1388177d726d6a33d612349..7bbe94d5db6b0345bb4a3fefe8a75f2a696902e9 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/pool_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -32,44 +33,78 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, std::shared_ptr pool_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + pool_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); + + int mode = 0; auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; if (pooling_type == "max") { - npu_mode = 0; + mode = 0; } else if (pooling_type == "avg") { - npu_mode = 1; + mode = 1; CHECK(op_info->GetAttr("exclusive")) << "[NPU] exclusive must be true in HiAI DDK"; } else { LOG(FATAL) << "[NPU] Unsupported pooling type: " << pooling_type; } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_mode(mode); + + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + pool_node->set_attr_pad_mode(pad_mode); + + bool global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_global_pooling(global_pooling); + auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + auto window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + pool_node->set_attr_window(window); - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; + auto paddings = op_info->GetAttr>("paddings"); + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the inputs size."; + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } auto strides = op_info->GetAttr>("strides"); + operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + auto npu_pad = ge::AttrValue::LIST_INT{ + paddings[0], paddings[1], paddings[2], paddings[3]}; + pool_node->set_attr_pad(npu_pad); + auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; + pool_node->set_attr_stride(npu_stride); + + int ceil_mode = 0; if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; + ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); + pool_node->set_attr_ceil_mode(ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(pool_node); - node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; return outputs_map; diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc index d4543a6ae128a0c534b216e42c6f3488a1dbfbf9..298e06554776e0f9efeade540d6498d1f71f8a16 100644 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ b/lite/kernels/npu/bridges/pool_op_test.cc @@ -61,7 +61,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -163,7 +163,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4725bdfb0e17c4f99dfd2359ff34c96f9e5af6e5 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type ReduceMeanConverter( + const std::shared_ptr reduce_mean_op, + const node_map_type& inputs_map) { + auto scope = reduce_mean_op->scope(); + auto op_info = reduce_mean_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + // get input, and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x_dims = scope->FindTensor(x_var_name)->dims(); + auto keep_dim = op_info->GetAttr("keep_dim"); + auto dim = op_info->GetAttr>("dim"); + CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty."; + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_dims.size(); + } + } + std::sort(dim.begin(), dim.end()); + + // create reduce_mean(reduce_sum + scale) node and set input node from + // inputs_map + // creat reduce_sum node + auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum"); + auto reduce_sum_node = std::make_shared(unique_reduce_sum); + CHECK(inputs_map.count(x_var_name)); + reduce_sum_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reduce_sum_node); + + auto dim_const_node = + std::make_shared(unique_reduce_sum + "/dim"); + dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim)); + reduce_sum_node->set_input_w(*dim_const_node); + lite::npu::OpList::Global().add(dim_const_node); + + reduce_sum_node->set_attr_keep_dims(keep_dim); + + // create scale node + auto unique_scale = lite::npu::UniqueName("scale"); + auto scale_node = std::make_shared(unique_scale); + scale_node->set_input_x(*reduce_sum_node); + lite::npu::OpList::Global().add(scale_node); + + float scale = 1; + for (size_t i = 0; i < dim.size(); i++) { + scale /= x_dims[dim[i]]; + } + + std::vector scale_bias_shape = x_dims.Vectorize(); + if (keep_dim) { + for (size_t i = 0; i < dim.size(); i++) { + scale_bias_shape[dim[i]] = 1; + } + } else { + const int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + scale_bias_shape[dim[i]] = kDelFlag; + } + scale_bias_shape.erase( + remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), + scale_bias_shape.end()); + } + + auto filter_const_node = + std::make_shared(unique_scale + "/filter"); + filter_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); + scale_node->set_input_filter(*filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); + + scale_node->set_attr_axis(1); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = scale_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(reduce_mean, + paddle::lite::kernels::npu::bridges::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8646ce5c25b367cf3c9055f1ed13a225149a9cc7 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/reduce_mean_op.h" +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +void reduce_mean_n(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int n = 0; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] += static_cast(src[src_index]) / num_in; + } + } + } + } +} + +void reduce_mean_c(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int c = 0; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] += static_cast(src[src_index]) / channel_in; + } + } + } + } +} + +void reduce_mean_h(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = 0.0; + for (int h = 0; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] += static_cast(src[src_index]) / height_in; + } + } + } + } +} + +void reduce_mean_w(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = 0.0; + for (int w = 0; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] += static_cast(src[src_index]) / width_in; + } + } + } + } +} + +void reduce_mean_all(const float* src, + float* dst, + int num_in, + int channel_in, + 
int height_in, + int width_in) { + float mean = 0.0; + int src_index; + int n_id, c_id; + int all = num_in * channel_in * height_in * width_in; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + mean = src[src_index] / all; + } + } + } + } + dst[0] = mean; +} + +void reduce_mean_nc(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce n first. + DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +void reduce_mean_ch(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +void reduce_mean_hw(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +void reduce_mean_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto x_dims = x->dims(); + auto x_data = x->data(); + auto out = scope->FindMutableTensor("out_ref"); + + auto dim = op_info->GetAttr>("dim"); + auto keep_dim = op_info->GetAttr("keep_dim"); + + auto x_rank = x_dims.size(); + if (!dim.empty()) { + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_rank; + } + } + } + + bool reduce_all = false; + sort(dim.begin(), dim.end()); + if (dim.size() == 0) { + reduce_all = true; + } + + std::vector out_dims; + if (reduce_all) { + if (keep_dim) { + for (size_t i = 0; i < x_dims.size(); i++) { + out_dims.push_back(1); + } + } else { + out_dims.push_back(1); + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + out_dims.push_back(x_dims[i]); + } + if (keep_dim) { + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = 1L; + } + } else { + int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + out->Resize(DDim(out_dims)); + } + + auto out_data = out->mutable_data(); + int in_n = x_dims[0]; + int in_c = x_dims[1]; + int in_h = x_dims[2]; + int in_w = x_dims[3]; + + if (dim.size() == 0) { + reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim.size() == 1) { + switch (dim[0]) { + case 0: + reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 1: + reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 2: + reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 3: + 
reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w); + break; + default: + LOG(FATAL) << "error!!!"; + } + } else if (dim.size() == 2) { + if (dim[0] == 0 && dim[1] == 1) { + reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 1 && dim[1] == 2) { + reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 2 && dim[1] == 3) { + reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w); + } else { + LOG(FATAL) << "invalid dim!!"; + } + } +} + +void test_reduce_mean(const std::vector& input_shape, + std::vector dim, + bool keep_dim) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reduce_mean"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("dim", dim); + opdesc.SetAttr("keep_dim", keep_dim); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + reduce_mean_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, reduce_mean) { + std::vector> reduce_dim{ + {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}}; + for (auto dim : reduce_dim) { + for (auto keep_dim : {true, false}) { + test_reduce_mean({1, 2, 3, 4}, dim, keep_dim); + } + } +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(reduce_mean); +USE_NPU_BRIDGE(reduce_mean); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index b2ed556faf543cca138dad1cb773225202fbaca5..a554aac94f270517d26ed76016678989b87b6ea6 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -41,8 +41,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (lite::npu::HasInputArg(op_info, scope, "Shape")) { + // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) + if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) { + LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor."; + } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..84ab3a9eb2db7420a7dd193e1c1cc6c32a362e55 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SqrtConverter(const std::shared_ptr sqrt_op, + const node_map_type& inputs_map) { + auto scope = sqrt_op->scope(); + auto op_info = sqrt_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr sqrt_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + sqrt_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(sqrt_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = sqrt_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op_test.cc b/lite/kernels/npu/bridges/sqrt_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..015d61685b2d99c3df55269442d61b4a137a2ca3 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void sqrt_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = std::sqrtf(x_data[i]); + } +} + +void test_sqrt(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x, 0, 5); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("sqrt"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + sqrt_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, sqrt) { + test_sqrt({2}); + test_sqrt({2, 3}); + test_sqrt({1, 2, 3, 4}); + test_sqrt({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(sqrt); +USE_NPU_BRIDGE(sqrt); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ca91adba0a8b24e6559599cb5952f8b47722ba3 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SquareConverter(const std::shared_ptr square_op, + const node_map_type& inputs_map) { + auto scope = square_op->scope(); + auto op_info = square_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr square_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + square_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(square_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = square_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(square, + paddle::lite::kernels::npu::bridges::SquareConverter); diff --git a/lite/kernels/npu/bridges/square_op_test.cc b/lite/kernels/npu/bridges/square_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d715c11430096a0b6503fbe6047a40c3c29ba8f5 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void square_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = x_data[i] * x_data[i]; + } +} + +void test_square(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("square"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + square_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, square) { + test_square({2}); + test_square({2, 3}); + test_square({1, 2, 3, 4}); + test_square({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(square); +USE_NPU_BRIDGE(square); diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d070eb84c5313e7539f28da0a90dcc3662be01a1..99b23c19f0f5870102782f0b4d639f6103257c31 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_OPENCL) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) return () endif() diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index 04a78face2b9c07c42aceb53f0f797ded46e59d9..e13d12ec224c4ececf53c55c8acb1f1b0e483801 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -38,15 +38,20 @@ void ConvCompute::PrepareForRun() { int w_out = output_dims[3]; int kernel_h = filter_dims[2]; // oihw int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int stride_h = param.strides[0]; int stride_w = param.strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int groups = param.groups; bool relu_fused = param.fuse_relu; - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool zero_pad = (pad_h == 0) && (pad_w == 0); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + VLOG(3) << "Is relu fused? / " << (relu_fused ? 
"Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -60,7 +65,7 @@ void ConvCompute::PrepareForRun() { << filter_dims[2] << " " << filter_dims[3]; if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && - zero_pad && no_dilation) { + zero_pad && no_dilation && pad_equal) { // conv2d_1x1 kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/fc_kernel.cl"); @@ -70,7 +75,7 @@ void ConvCompute::PrepareForRun() { build_options_.push_back("-DCL_DTYPE=float"); } impl_ = &ConvCompute::Conv2d1x1; - } else { + } else if (pad_equal) { kernel_func_names_.push_back("im2col"); kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/im2col_kernel.cl"); @@ -85,6 +90,9 @@ void ConvCompute::PrepareForRun() { col_buffer_.reset(new lite::Tensor); col_buffer_->Resize({bs, c_in, kernel_h * kernel_w, h_out * w_out}); col_buffer_->mutable_data(TARGET(kOpenCL)); + } else { + LOG(FATAL) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; } for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -102,17 +110,19 @@ void ConvCompute::GemmlikeConv2d() { int c_in = x_dims[1]; int h_in = x_dims[2]; int w_in = x_dims[3]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int c_out = output_dims[1]; int h_out = output_dims[2]; int w_out = output_dims[3]; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dilation_h = param.dilations[0]; - int dilation_w = param.dilations[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; auto* x_buf = param.x->data(); auto* filter_buf = param.filter->data(); diff --git a/lite/kernels/opencl/conv_compute_test.cc b/lite/kernels/opencl/conv_compute_test.cc index a7417e3525605e208c8e25cd5d34200e6652053d..3bc7a0734db0314f911981027ceeef02fcbf96c7 100644 --- a/lite/kernels/opencl/conv_compute_test.cc +++ b/lite/kernels/opencl/conv_compute_test.cc @@ -24,7 +24,6 @@ namespace lite { #define A(i, j) a[i * lda + j] #define B(i, j) cur_b[i * ldb + j] #define C(i, j) cur_c[i * ldc + j] - template static void conv_basic(const Dtype1* din, Dtype2* dout, @@ -227,10 +226,12 @@ TEST(conv2d, compute_conv2d_1x1) { param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); @@ -454,11 +455,14 @@ TEST(conv2d, compute_conv2d_gemm) { param.bias = bias_flag ? 
&bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index 62734610e280c89f9df2e367fd7251c7d25756e7..ed942d7f0cb7b0bab119f258fb6393b9dbd211a6 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -44,7 +44,7 @@ class DepthwiseConv2dCompute auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); auto output_dims = param.output->dims(); - auto paddings = param.paddings; + auto paddings = *param.paddings; auto strides = param.strides; auto& context = ctx_->As(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index a189acaf919e605b4810770e7136d00baeea4bfa..3556d1abedd5b4548b78b90b75de2ee86572fdb7 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -105,7 +105,8 @@ TEST(depthwise_conv2d, compute) { param.x = &input; param.filter = &filter; param.output = &output; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); param.strides = std::vector{1, 1}; std::unique_ptr context(new KernelContext); diff --git a/lite/kernels/opencl/io_copy_compute.cc b/lite/kernels/opencl/io_copy_compute.cc index dc4bdfe64c65f21e8f68a26df3e2962087f50bef..3387a0887d3422636e39e742149f84672e8e75d4 100644 --- a/lite/kernels/opencl/io_copy_compute.cc +++ b/lite/kernels/opencl/io_copy_compute.cc @@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto* x_ptr = param.x->data(); - /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` - in kernel and enable wait_list + /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to + `cl_wait_list` + in kernel and `wait_list` enabled auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl tensor. 
---"; diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index dc2e851595b08e1ff401499502fab64df4dfa46f..d275b312d67b5aba7050a195949ee4c3792b5da7 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -44,16 +44,22 @@ class PoolCompute const auto& out_dims = param.output->dims(); const std::string pooling_type = param.pooling_type; const bool global_pooling = param.global_pooling; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } } - + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + if (!pads_equal) { + LOG(FATAL) + << "padding requires pad_left == pad_right, pad_top == pad_bottom"; + } auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); auto* input_buf = param.x->data(); @@ -89,7 +95,7 @@ class PoolCompute CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(paddings[0])); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[1])); + status = kernel.setArg(++arg_idx, static_cast(paddings[2])); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *output_buf); CL_CHECK_FATAL(status); diff --git a/lite/kernels/opencl/pool_compute_test.cc b/lite/kernels/opencl/pool_compute_test.cc index 53f64e950500425655fbd450d5961a2a8dbc412d..25f0e72634775f4c5e82a6bd800f9ca980da2e34 100644 --- a/lite/kernels/opencl/pool_compute_test.cc +++ b/lite/kernels/opencl/pool_compute_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
 #include <gtest/gtest.h>
+#include <memory>
 #include <random>
 #include "lite/backends/opencl/target_wrapper.h"
 #include "lite/core/op_registry.h"
@@ -88,9 +89,10 @@ TEST(pool2d, compute) {
   param.output = &out;
   param.global_pooling = true;
   param.pooling_type = "avg";
-  param.paddings = std::vector<int>{0, 0};
+  std::vector<int> paddings = {0, 0, 0, 0};
   param.strides = std::vector<int>{1, 1};
   param.ksize = std::vector<int>{7, 7};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);
 
   std::unique_ptr<KernelContext> context(new KernelContext);
   context->As<OpenCLContext>().InitOnce();
diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt
index da955e4fd5902373cd881f85a8bc715eef7cec94..bf3a1685f028740da1b7f4dfa38f19b73d30df89 100644
--- a/lite/kernels/x86/CMakeLists.txt
+++ b/lite/kernels/x86/CMakeLists.txt
@@ -5,6 +5,7 @@ add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${li
 # lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(cast_compute_x86 X86 basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
 add_kernel(slice_compute_x86 X86 basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(squeeze_compute_x86 X86 basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(fill_constant_batch_size_like_compute_x86 X86 basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_function)
@@ -15,8 +16,10 @@ add_kernel(conv_compute_x86 X86 basic SRCS conv_compute.cc DEPS ${lite_kernel_de
 # lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} )
 # lite_cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
 add_kernel(pool_compute_x86 X86 basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+add_kernel(stack_compute_x86 X86 basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(dropout_compute_x86 X86 basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(transpose_compute_x86 X86 basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_function)
+add_kernel(layer_norm_compute_x86 X86 basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} jit_kernel_helper)
 # add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps})
 # lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
 # lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps} )
@@ -26,6 +29,7 @@ add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_comp
 # lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
 # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
+add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
 # lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
 # lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
 # lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
@@ -33,12 +37,27 @@ add_kernel(mul_compute_x86 X86 basic SRCS mul_compute.cc DEPS ${lite_kernel_deps
 add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling)
+add_kernel(search_group_padding_compute_x86 X86 basic SRCS search_group_padding_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(sequence_reverse_compute_x86 X86 basic SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
 add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(batch_norm_compute_x86 X86 basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reduce_sum_compute_x86 X86 basic SRCS reduce_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(lookup_table_compute_x86 X86 basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_reshape_compute_x86 X86 basic SRCS sequence_reshape_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(match_matrix_tensor_compute_x86 X86 basic SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps} blas math_function)
+add_kernel(search_seq_depadding_compute_x86 X86 basic SRCS search_seq_depadding_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(search_grnn_compute_x86 X86 basic SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps} blas math_function)
+add_kernel(sequence_concat_compute_x86 X86 basic SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(var_conv_2d_compute_x86 X86 basic SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps} blas fluid_data_type)
+add_kernel(attention_padding_mask_compute_x86 X86 basic SRCS attention_padding_mask_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(sequence_arithmetic_compute_x86 X86 basic SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps})
+
+# for content-dnn specific
+add_kernel(search_aligned_mat_mul_compute_x86 X86 extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} blas)
+add_kernel(search_seq_fc_compute_x86 X86 extra SRCS search_seq_fc_compute.cc DEPS ${lite_kernel_deps} blas)
+add_kernel(sequence_topk_avg_pooling_compute_x86 X86 basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps} sequence_topk_avg_pooling)
+add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps} search_fc)
 
 if(NOT LITE_WITH_X86)
   return()
@@ -47,12 +66,14 @@ add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kerne
 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
 lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86)
+lite_cc_test(test_gather_compute_x86 SRCS gather_compute_test.cc DEPS gather_compute_x86)
 lite_cc_test(test_slice_compute_x86 SRCS slice_compute_test.cc DEPS slice_compute_x86)
 lite_cc_test(test_squeeze_compute_x86 SRCS squeeze_compute_test.cc DEPS squeeze_compute_x86)
 lite_cc_test(test_fill_constant_batch_size_like_compute_x86 SRCS fill_constant_batch_size_like_compute_test.cc DEPS fill_constant_batch_size_like_compute_x86)
 lite_cc_test(test_reshape_compute_x86 SRCS reshape_compute_test.cc DEPS reshape_compute_x86)
 lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
 lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86)
+lite_cc_test(test_sequence_reverse_compute_x86 SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_x86)
 lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86)
 lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
 lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
@@ -63,7 +84,19 @@ lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_com
 lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86)
 lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86)
 lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86)
-
+lite_cc_test(test_cast_compute_x86 SRCS cast_compute_test.cc DEPS cast_compute_x86)
 lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
+lite_cc_test(test_layer_norm_compute_x86 SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_x86)
 lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
 lite_cc_test(test_transpose_compute_x86 SRCS transpose_compute_test.cc DEPS transpose_compute_x86)
+lite_cc_test(test_search_fc_compute_x86 SRCS search_fc_compute_test.cc DEPS search_fc_compute_x86)
+lite_cc_test(test_search_seq_depadding_compute_x86 SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_x86)
+lite_cc_test(test_search_grnn_compute_x86 SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_x86)
+lite_cc_test(test_match_matrix_compute_x86 SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_x86)
+lite_cc_test(test_lookup_table_compute_x86 SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_x86)
+lite_cc_test(test_stack_compute_x86 SRCS stack_compute_test.cc DEPS stack_compute_x86)
+lite_cc_test(test_search_group_padding_compute_x86 SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_x86)
+lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_x86)
+lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_x86)
+#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86)
+lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86)
diff --git a/lite/kernels/x86/attention_padding_mask_compute.cc b/lite/kernels/x86/attention_padding_mask_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c35c416e7771f7896c5378ec8c0199b91ffd685
--- /dev/null
+++ b/lite/kernels/x86/attention_padding_mask_compute.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/x86/attention_padding_mask_compute.h" + +REGISTER_LITE_KERNEL( + search_attention_padding_mask, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b9124e5ad49a0d68c41a21fe55d28102f09d14b9 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/eigen.h" +#include "lite/operators/attention_padding_mask_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* _pad_begin = param.pad_begin; + auto* top = param.Out; + int _pad_id = param.pad_id; + float _mask = param.mask; + auto src_len = static_cast(bottom1->lod()[0][1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + int* pad_begin = _pad_begin->mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(); + memcpy(top_data, + bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = top->mutable_data() + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } + + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35ce822e010fc3ce2dc756b86e3a437789cc8359 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/attention_padding_mask_compute.cc" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? 
cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_x86, retrive_op) { + auto attention_padding_mask = + KernelRegistry::Global().Create( + "attention_padding_mask"); + ASSERT_FALSE(attention_padding_mask.empty()); + ASSERT_TRUE(attention_padding_mask.front()); +} + +TEST(attention_padding_mask_x86, init) { + AttentionPaddingMaskCompute attention_padding_mask; + ASSERT_EQ(attention_padding_mask.precision(), PRECISION(kFloat)); + ASSERT_EQ(attention_padding_mask.target(), TARGET(kX86)); +} + +TEST(attention_padding_mask_x86, run_test) { + lite::Tensor x, y; + lite::Tensor out, pad_begin, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x, x_lod, get_max_len(y_lod)); + prepare_input(&y, y_lod, 1); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + + attention_padding_mask_ref(x, y, &out_ref, &pad_begin_ref, param); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_attention_padding_mask, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d342056c7f19e9eba0fe16196d772da6bd5fda3c --- /dev/null +++ b/lite/kernels/x86/cast_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" + +REGISTER_LITE_KERNEL(cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/cast_compute.h b/lite/kernels/x86/cast_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..06e47e9a5023ea149510e8f10bf719cd6a854349 --- /dev/null +++ b/lite/kernels/x86/cast_compute.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" +#include "lite/fluid/hostdevice.h" +#include "lite/fluid/transform.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +class CastOpFunctor { + public: + CastOpFunctor(const lite::Tensor* in, + lite::Tensor* out, + const lite::Context& context) + : input(in), output(out), ctx(context) {} + + template + void apply() const { + auto* in_begin = input->data(); + auto numel = input->dims().production(); + auto* in_end = in_begin + numel; + auto* out_begin = output->mutable_data(); + paddle::lite::fluid::Transform trans; + trans( + ctx, in_begin, in_end, out_begin, CastOpTransformFunctor()); + } + + private: + const lite::Tensor* input; + lite::Tensor* output; + const lite::Context& ctx; +}; + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override { + auto param = param_.get_mutable(); + auto& context = ctx_->As(); + auto x = param->X; + auto out = param->Out; + auto out_dtype = param->out_dtype; + paddle::lite::fluid::VisitDataType( + static_cast(out_dtype), + CastOpFunctor(x, out, context)); + } + virtual ~CastCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7aa52ca6d0dde603357f009220b4a3a53f56833 --- /dev/null +++ b/lite/kernels/x86/cast_compute_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/cast_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(cast_x86, retrive_op) { + auto cast = + KernelRegistry::Global().Create("cast"); + ASSERT_FALSE(cast.empty()); + ASSERT_TRUE(cast.front()); +} + +TEST(cast_x86, init) { + CastCompute cast; + ASSERT_EQ(cast.precision(), PRECISION(kFloat)); + ASSERT_EQ(cast.target(), TARGET(kX86)); +} + +TEST(cast_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 1, 3, 3}; + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape{batch_size, 1, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(1); + } + + CastCompute cast; + operators::CastParam param; + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + cast.SetContext(std::move(ctx)); + cast.SetParam(param); + cast.Run(); + + std::vector ref_results = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(cast, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index 48cb3c74ef3c05675115ab7cec09f16322d1410a..e9f403059f90cf6635bc22db3e6890b86cbe85f6 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -67,7 +67,7 @@ class Conv2dCompute : public KernelLite { lite::DDim col_shape(col_shape_vec); lite::DDim col_matrix_shape = col_shape.Flatten2D(data_dim + 1); bool is_expand = IsExpand( - filter_shape_vec, param.strides, param.paddings, param.dilations); + filter_shape_vec, param.strides, *param.paddings, *param.dilations); lite::Tensor col; lite::Tensor col_matrix; if (is_expand) { @@ -95,20 +95,15 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - lite::Tensor tmp_in_batch = param.x->Slice(i, i + 1); - tmp_in_batch.Resize(input_shape); - in_batch.ShareDataWith(tmp_in_batch); - lite::Tensor out_batch; - lite::Tensor tmp_out_batch = param.output->Slice(i, i + 1); - tmp_out_batch.Resize(output_matrix_shape); - out_batch.ShareDataWith(tmp_out_batch); + lite::Tensor in_batch = param.x->Slice(i, i + 1); + in_batch.Resize(input_shape); + lite::Tensor out_batch = param.output->Slice(i, i + 1); + out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( + lite::Tensor in_slice = in_batch.Slice(static_cast(g * in_step), - static_cast((g + 1) * in_step))); - + static_cast((g + 1) * in_step)); + auto paddings = *param.paddings; if (!is_expand) { col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); @@ -117,32 +112,30 @@ class Conv2dCompute : public KernelLite { // im2col im2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - std::vector{param.paddings[0], - param.paddings[1], - param.paddings[0], - param.paddings[1]}, + std::vector{ + paddings[0], paddings[2], paddings[0], paddings[2]}, &(col)); } else if (data_dim == 3U) { // vol2col vol2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - 
param.paddings, + *param.paddings, &(col)); } // gemm lite::Tensor out_slice; - out_slice.ShareDataWith( + out_slice = out_batch.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); lite::Tensor filter_slice; - filter_slice.ShareDataWith( + filter_slice = filter.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); blas.MatMul(filter_slice, false, col_matrix, diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index f2dde962b9e77ce26336d17f07f29f5874ef9722..2827c6577e5bf311b4002526d4ac10f636162d96 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -73,9 +73,11 @@ TEST(conv2d_x86, run_test) { param.bias = &b; param.output = &out; param.strides = {1, 1}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.groups = 1; - param.dilations = {1, 1}; + std::vector dilations = {1, 1}; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); LOG(INFO) << 123; std::unique_ptr ctx(new KernelContext); ctx->As(); diff --git a/lite/kernels/x86/fill_constant_compute.cc b/lite/kernels/x86/fill_constant_compute.cc index 1eb76332ccc21b0c5196d71b9246ed8b144a6593..dace1e90258a93aa5c8e89d1d9369adf39416659 100644 --- a/lite/kernels/x86/fill_constant_compute.cc +++ b/lite/kernels/x86/fill_constant_compute.cc @@ -29,6 +29,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -55,5 +87,9 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::x86::FillConstantCompute, def) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/lite/kernels/x86/gather_compute.cc b/lite/kernels/x86/gather_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..836f336271ef53c338cca89855b48c94c778cc54 --- /dev/null +++ b/lite/kernels/x86/gather_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/gather_compute.h" + +typedef paddle::lite::kernels::x86::GatherCompute GatherInt32; +typedef paddle::lite::kernels::x86::GatherCompute GatherInt64; + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt64, int64_in) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6ee270647f8fb7d7ec540047cd4d546a7eb89ce8 --- /dev/null +++ b/lite/kernels/x86/gather_compute.h @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/api/paddle_place.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-IndexT index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const lite::Tensor* src, + const lite::Tensor* index, + lite::Tensor* output) { + // check index of shape 1-D + if (index->dims().size() == 2) { + CHECK(index->dims()[1] == 1) << "Index(Input)'s dimension[1] should be 1 " + "when Index(input)'s dimension's size " + "equal to 2 in Gather(Op)."; + } else { + CHECK(index->dims().size() == 1) + << "Index(Input)'s dimension's size() should be 1 or 2 in Gather(Op)."; + } + int64_t index_size = index->dims()[0]; + + auto src_dims = src->dims(); + + const T* p_src = src->data(); + const IndexT* p_index = index->data(); + T* p_output = output->mutable_data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +template +class GatherCompute : public KernelLite { + public: + using param_t = operators::GatherParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto index = param.Index; + auto out = param.Out; + + out->mutable_data(); + if (x->dims().production() == 0) return; + /* + * Since there's no type defined for lite::Tensor in Paddle-Lite, then + * convert the Index's value to float which must be int32_t or int64_t and + * this supposes to cause no precision difference during inference just for + * now. + * Alternatively, if define the Tensor's type during registering, may cause + * a redefinition error. + */ + CPUGather(x, index, out); + } + + virtual ~GatherCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19 --- /dev/null +++ b/lite/kernels/x86/gather_compute_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/gather_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gather_x86, retrive_op) { + auto gather = + KernelRegistry::Global().Create( + "gather"); + ASSERT_FALSE(gather.empty()); + int cnt = 0; + for (auto item = gather.begin(); item != gather.end(); ++item) { + cnt++; + ASSERT_TRUE(*item); + } + ASSERT_EQ(cnt, 2); +} + +TEST(gather_x86, int32_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +TEST(gather_x86, int64_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +template +void test_case_1dims() { + lite::Tensor x, index, out; + std::vector x_shape{10}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data{1, 3, 5}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +template +void test_case_2dims() { + lite::Tensor x, index, out; + std::vector x_shape{10, 20}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3, 20}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data(60); + for (int i = 0; i < 20; ++i) { + ref_data[i] = static_cast(20 + i); + } + for (int i = 20; i < 40; ++i) { + ref_data[i] = static_cast(40 + i); + } + for (int i = 40; i < 60; ++i) { + ref_data[i] = static_cast(60 + i); + } + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +TEST(gather_x86, run_test_1dims) { + test_case_1dims(); + test_case_1dims(); +} + +TEST(gather_x86, run_test_2dims) { + test_case_2dims(); + test_case_2dims(); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, int64_in); diff --git a/lite/kernels/x86/layer_norm_compute.cc b/lite/kernels/x86/layer_norm_compute.cc new file mode 100644 index 
0000000000000000000000000000000000000000..4854a69a1d5f38bff102d984f990aea4ad723439 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/layer_norm_compute.h" + +REGISTER_LITE_KERNEL(layer_norm, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..bbbdb91debfd7d7b046a3eb18a535462c69e358c --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.h @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layer_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + float epsilon = param.epsilon; + auto Scale = param.Scale; + auto Bias = param.Bias; + auto x = param.X; + + auto y = param.Y; + auto Mean = param.Mean; + auto Var = param.Variance; + auto begin_norm_axis = param.begin_norm_axis; + + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + lite::Tensor in; + in.ShareDataWith(*x); + in.Resize(matrix_shape); + lite::Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + PADDLE_ENFORCE_EQ(Mean->numel(), left); + PADDLE_ENFORCE_EQ(Var->numel(), left); + PADDLE_ENFORCE_EQ(Scale->numel(), right); + PADDLE_ENFORCE_EQ(Bias->numel(), right); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(in.mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + } + + virtual ~LayerNormCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbac39505204b3799f6c5274f80690196e83a725 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/layer_norm_compute.h" +#include +#include +#include +#include +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +std::vector ref(lite::Tensor* x, + lite::Tensor* Scale, + lite::Tensor* Bias, + lite::Tensor* y, + lite::Tensor* Mean, + lite::Tensor* Var, + int begin_norm_axis, + float epsilon) { + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + x->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(x->mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + + std::vector ref_data; + auto result = out.mutable_data(); + for (int i = 0; i < y->dims().production(); ++i) { + ref_data.emplace_back(result[i]); + } + return ref_data; +} + +// layer_norm +TEST(layer_norm_x86, retrive_op) { + auto layer_norm = + KernelRegistry::Global().Create( + "layer_norm"); + ASSERT_FALSE(layer_norm.empty()); + ASSERT_TRUE(layer_norm.front()); +} + +TEST(layer_norm_x86, init) { + lite::kernels::x86::LayerNormCompute layer_norm; + ASSERT_EQ(layer_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(layer_norm.target(), TARGET(kX86)); +} + +TEST(layer_norm_x86, run_test) { + lite::Tensor x; + lite::Tensor Scale; + lite::Tensor Bias; + + lite::Tensor out; + lite::Tensor Mean; + lite::Tensor Var; + + std::vector x_shape({1, 2, 3, 1}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({1, 2, 3, 1}); + out.Resize(lite::DDim(out_shape)); + + int begin_norm_axis = 0; + float epsilon = 1e-5; + int pre = 1; + int post = 1; + for (int i = 0; i < begin_norm_axis; ++i) { + pre *= x_shape[i]; + } + for (int i = begin_norm_axis; i < x_shape.size(); ++i) { + post *= x_shape[i]; + } + std::vector scale_shape({post}); + Scale.Resize(scale_shape); + std::vector bias_shape({post}); + Bias.Resize(bias_shape); + + auto x_data = x.mutable_data(); + auto scale_data = Scale.mutable_data(); + auto bias_data = Bias.mutable_data(); + auto out_data = out.mutable_data(); + auto mean_data = Mean.mutable_data(); + auto var_data = Var.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < Scale.dims().production(); ++i) { + scale_data[i] = 1.5; + } + for (int64_t i = 0; i < Bias.dims().production(); ++i) { + bias_data[i] = 0.25; + } + + LayerNormCompute layer_norm; + operators::LayerNormParam param; + + param.X = &x; + param.Y = &out; + param.Scale = &Scale; + param.Bias = &Bias; + param.Mean = &Mean; + param.Variance = &Var; + param.begin_norm_axis = begin_norm_axis; + param.epsilon = epsilon; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + layer_norm.SetContext(std::move(ctx)); + layer_norm.SetParam(param); + layer_norm.Run(); + + std::vector ref_data = + ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon); + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data[j], 1e-5); + // LOG(INFO) << 
out_data[j]; + } + LOG(INFO) << *mean_data; + LOG(INFO) << *var_data; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(layer_norm, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/lookup_table_compute.cc b/lite/kernels/x86/lookup_table_compute.cc index 364593251e17453011bad5b2c1057fc25d54d7c8..856a07a94cada4702d47820605436cee6523a527 100644 --- a/lite/kernels/x86/lookup_table_compute.cc +++ b/lite/kernels/x86/lookup_table_compute.cc @@ -32,3 +32,13 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kX86, + kInt64, + kNCHW, + paddle::lite::kernels::x86::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index e0d7752ca77c810700f57722c4186b4e02d6411f..d5719f332ce4b0b590b0cab26c5a98e864d2cc5e 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -30,7 +30,6 @@ class LookupTableCompute : public KernelLite { void Run() override { auto ¶m = *param_.get_mutable(); - // auto& context = context_->As(); auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; @@ -41,18 +40,18 @@ class LookupTableCompute : public KernelLite { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - auto *table = table_t->data(); - auto *output = output_t->mutable_data(); - memset(output, 0, output_t->dims().production() * sizeof(float)); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(float)); + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { CHECK_LT(ids[i], row_number); CHECK_GE(ids[i], 0); memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(float)); + row_width * sizeof(T)); } } } diff --git a/lite/kernels/x86/lookup_table_compute_test.cc b/lite/kernels/x86/lookup_table_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..86b2d39186b10de6def72a217cd6c70773b59420 --- /dev/null +++ b/lite/kernels/x86/lookup_table_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(lookup_table_x86, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + int vocab_size = 40; + int emb_size = 50; + int ids_h = 30; + int ids_w = 20; + + auto w_dim = DDim({vocab_size, emb_size}); + auto ids_dim = DDim({ids_h, ids_w}); + auto out_dim = DDim({ids_h, ids_w, emb_size}); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto* w_data = w.mutable_data(); + auto* ids_data = ids.mutable_data(); + auto* out_data = out.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % vocab_size; + } + int out_num = out_dim.production(); + for (int i = 0; i < out_num; i++) { + out_ref_data[i] = + static_cast((i % (vocab_size * emb_size)) + 1) / (w_num + 1); + } + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + param.padding_idx = padding_idx; + lookup_table.SetParam(param); + lookup_table.Run(); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..feda180d22e59b2ca0e8f0f89f3c7a1ddb8acd4a --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void MatchMatrixTensorCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->template data(); + auto* bottom_r_data = y->template data(); + auto* t_data = w->template data(); + auto* out_data = out->template mutable_data(); + auto* bottom_l_trans_data = tmp->template mutable_data(); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + memset(bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x->dims()[0], + dim_t * dim_in, + dim_in, + 1.0f, + bottom_l_data, + dim_in, + t_data, + dim_t * dim_in, + 0.0f, + bottom_l_trans_data, + dim_t * dim_in); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasTrans, + len_l, + len_r, + dim_in, + 1.0f, + l_t_data, + dim_t * dim_in, + r_data, + dim_in, + 0.0f, + top_data, + len_r); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + match_matrix_tensor, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.h b/lite/kernels/x86/match_matrix_tensor_compute.h 
new file mode 100644
index 0000000000000000000000000000000000000000..6189676fd846e2ac73fb17ffb966cdf815d9a371
--- /dev/null
+++ b/lite/kernels/x86/match_matrix_tensor_compute.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include "lite/backends/x86/math/blas.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/operators/op_params.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+class MatchMatrixTensorCompute
+    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatchMatrixTensorParam;
+
+  void Run() override;
+
+  virtual ~MatchMatrixTensorCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f3ad50940ab0059ab04fb507a786f735584b9
--- /dev/null
+++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(match_matrix_tensor_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "match_matrix_tensor"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(match_matrix_tensor_x86, init) { + MatchMatrixTensorCompute mmtc; + ASSERT_EQ(mmtc.precision(), PRECISION(kFloat)); + ASSERT_EQ(mmtc.target(), TARGET(kX86)); +} + +TEST(match_matrix_tensor_x86, run_test) { + int ix = 5, iy = 4, h = 2, dim_t = 2; + lite::Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* y_data = y.mutable_data(); + for (int64_t i = 0; i < y.numel(); i++) { + y_data[i] = static_cast(i); + } + auto* w_data = w.mutable_data(); + for (int64_t i = 0; i < w.numel(); i++) { + w_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + MatchMatrixTensorCompute mmtc; + mmtc.SetContext(std::move(ctx)); + + operators::MatchMatrixTensorParam param; + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + + mmtc.SetParam(param); + mmtc.Run(); + + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + // LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(match_matrix_tensor, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/mean_compute.cc b/lite/kernels/x86/mean_compute.cc index b618d2d3775e148c4b5f2c864eaa4de2dc40c08a..1216d99ad807c673ee6aa764fd895732540d86c5 100644 --- a/lite/kernels/x86/mean_compute.cc +++ b/lite/kernels/x86/mean_compute.cc @@ -54,29 +54,6 @@ class MeanCompute : public KernelLite { virtual ~MeanCompute() = default; }; -template -class MeanGradCompute : public KernelLite { - public: - using param_t = operators::MeanGradParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK_EQ(param.Out_grad->raw_tensor().numel(), 1); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - T x_grad_size = static_cast(param.X_grad->raw_tensor().numel()); - Eigen::DSizes bcast(static_cast(x_grad_size)); - EigenVector::Flatten(param.X_grad->raw_tensor()) - .device(*(context.x86_device_context()->eigen_device())) = - (EigenVector::From(param.Out_grad->raw_tensor()) / x_grad_size) - .broadcast(bcast); - } - - virtual ~MeanGradCompute() = default; -}; - } // namespace x86 } // namespace kernels } // namespace lite @@ -93,16 +70,3 @@ REGISTER_LITE_KERNEL(mean, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -REGISTER_LITE_KERNEL(mean_grad, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::MeanGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - 
.BindInput(paddle::framework::GradVarName("Out"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput(paddle::framework::GradVarName("X"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/mul_compute.cc b/lite/kernels/x86/mul_compute.cc index 64558f66772381ad402a3eb203bb6efd9fceff60..3de4340543cff6867f7879f0551be7a33c9e6862 100644 --- a/lite/kernels/x86/mul_compute.cc +++ b/lite/kernels/x86/mul_compute.cc @@ -24,21 +24,3 @@ REGISTER_LITE_KERNEL(mul, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -// #ifdef LITE_WITH_TRAIN -// REGISTER_LITE_KERNEL(mul_grad, -// kX86, -// kFloat, -// kNCHW, -// paddle::lite::kernels::x86::MulGradCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput(paddle::framework::GradVarName("Out"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("X"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("Y"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .Finalize(); -// #endif diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index e204fc81f28de4af43d63e289b01d81188502988..be58f24ba2ed37db6661ecaaceb0d9d70fdd75d4 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -81,78 +81,6 @@ class MulCompute : public KernelLite { virtual ~MulCompute() = default; }; -#ifdef LITE_WITH_TRAIN -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - Tensor x_matrix, y_matrix; - - if (x->dims().size() > 2) { - x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); - } else { - x_matrix = *x; - } - - if (y->dims().size() > 2) { - y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); - - } else { - y_matrix = *y; - } - - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; -#endif - } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/pool_compute.h b/lite/kernels/x86/pool_compute.h index 57bcddcec9512d626962465e717b7a202cfe0b17..0dccb245b1267ac7ffa7c75bda9b491ffc3cd191 100644 --- a/lite/kernels/x86/pool_compute.h +++ b/lite/kernels/x86/pool_compute.h @@ -35,7 +35,6 @@ class PoolCompute : public KernelLite { auto& param = *param_.get_mutable(); if (param.global_pooling) { for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; param.ksize[i] = static_cast(param.x->dims()[i + 2]); } } @@ -52,7 +51,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, true, false, @@ -68,7 +67,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, param.exclusive, param.adaptive, diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 87b75a0760bca45057f25b2cb948a66feb22496c..4ea727cedd5206f5f1ac2685297f72c3019bb313 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -60,7 +60,8 @@ TEST(pool2d_x86, run_test) { param.x = &x; param.output = &out; param.strides = {2, 2}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; + param.paddings = std::make_shared>(paddings); param.ksize = {2, 2}; param.pooling_type = "max"; std::unique_ptr ctx(new KernelContext); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.cc b/lite/kernels/x86/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..956f2a3beb8ae845b71c31600fdf8e6c758cab6a --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_aligned_mat_mul_compute.h" + +REGISTER_LITE_KERNEL( + search_aligned_mat_mul, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.h b/lite/kernels/x86/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ea6b546c2ccefbb98269ad563a566cc668e6a441 --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + lite::x86::math::MatDescriptor mat_dim_a; + mat_dim_a.height_ = M; + mat_dim_a.width_ = K; + mat_dim_a.stride_ = x_batch_size * x_inner_size; + mat_dim_a.batch_size_ = seq_num; + mat_dim_a.trans_ = x_transpose; + lite::x86::math::MatDescriptor mat_dim_b; + mat_dim_b.height_ = K; + mat_dim_b.width_ = N; + mat_dim_b.stride_ = y_batch_size * y_inner_size; + mat_dim_b.batch_size_ = seq_num; + mat_dim_b.trans_ = y_transpose; + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, static_cast(alpha), out, T(0)); + } + + virtual ~SearchAlignedMatMulCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute.cc b/lite/kernels/x86/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf76113e01d81e899250a60203680cd984746f19 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
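Reading aid for SearchAlignedMatMulCompute above: the MatDescriptor stride_/batch_size_ fields set up a strided batched GEMM in which the i-th sequence of X and Y fills the i-th batch slot, and every sequence is assumed to have the same length (only x_lod[0][1] and y_lod[0][1] are read). A minimal loop-nest equivalent, a sketch only, with hypothetical raw-pointer arguments:

// Naive stand-in for the strided batched GEMM issued by the kernel above.
// x: seq_num row-major blocks of {x_rows, x_cols}; y: blocks of {y_rows, y_cols};
// out: seq_num blocks of {M, N}; beta is 0 in the kernel, so out is overwritten.
void aligned_matmul_ref(const float* x, const float* y, float* out,
                        int seq_num, int x_rows, int x_cols,
                        int y_rows, int y_cols,
                        bool trans_x, bool trans_y, float alpha) {
  const int M = trans_x ? x_cols : x_rows;
  const int K = trans_x ? x_rows : x_cols;
  const int N = trans_y ? y_rows : y_cols;
  for (int s = 0; s < seq_num; ++s) {
    const float* A = x + s * x_rows * x_cols;  // stride_ = rows * cols
    const float* B = y + s * y_rows * y_cols;
    float* C = out + s * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) {
          const float a = trans_x ? A[k * x_cols + m] : A[m * x_cols + k];
          const float b = trans_y ? B[n * y_cols + k] : B[k * y_cols + n];
          acc += a * b;
        }
        C[m * N + n] = alpha * acc;
      }
    }
  }
}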
+ +#include "lite/kernels/x86/search_fc_compute.h" + +REGISTER_LITE_KERNEL(search_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_fc_compute.h b/lite/kernels/x86/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e0f44de526be102ac7be4f44517d01e0bc28ff94 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/search_fc.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + param.Out->Resize({param.X->dims()[0], param.out_size}); + lite::x86::math::SearchFcFunctor search_fc; + search_fc(context, *param.X, *param.W, *param.b, param.Out, param.out_size); + } + virtual ~SearchFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..425df2a0f0544d7345923cb2efdce96074845311 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_fc_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc_x86, retrive_op) { + auto search_fc = + KernelRegistry::Global().Create( + "search_fc"); + ASSERT_FALSE(search_fc.empty()); + ASSERT_TRUE(search_fc.front()); +} + +TEST(search_fc_x86, init) { + SearchFcCompute search_fc; + ASSERT_EQ(search_fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_fc.target(), TARGET(kX86)); +} + +TEST(search_fc_x86, run_test) { + lite::Tensor x, w, b, out; + lite::Tensor out_ref; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + std::vector x_shape{1, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + fc_cpu_base(&x, &w, &b, 4, &out_ref); + + SearchFcCompute fc; + operators::SearchFcParam param; + param.X = &x; + param.W = &w; + param.b = &b; + param.Out = &out; + param.out_size = 4; + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95839ba71b9f63fad9d659fd65c0028005d29799 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include "lite/backends/x86/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +void CallGemm(const lite::x86::math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + auto* _layout_input = param.layout_input; + auto* _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize({batch}); + _idx_sorted_by_width->Resize({batch}); + int* width_data = _width.template mutable_data(); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->template mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, + idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.template data()[a] > + _width.template data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->template mutable_data(); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->template data() + + dim1 * offset[idx_sorted_by_width_data[j]] + dim1 * i, + dim1 * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::CopyBack(T* from, T* to, int step) { + auto& param = this->Param(); + auto* _input = param.x; + auto* _layout_input 
= param.layout_input; + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = + _idx_sorted_by_width->template data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->template mutable_data(); + + const auto* dense_e2h = wi->template data(); + const auto* dense_h2h = wh->template data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->template mutable_data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->template mutable_data(); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + auto blas = lite::x86::math::GetBlas(context); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2h, + 0.0f, + w_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hr, + 0.0f, + wr_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hz, + 0.0f, + wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2h, + 0.0f, + u_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hr, + 0.0f, + ur_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hz, + 0.0f, + uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_grnn_compute.h b/lite/kernels/x86/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..66866761e139479863d98dd757d1a90ae36de9f5 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
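To summarize the recurrence in SearchGrnnCompute::Run above: after the three input projections are precomputed with GEMM, each step applies GRU-style gates per hidden unit. A scalar sketch of one unit at one time step (names are illustrative, and the batch reordering done by PrepareLayout is ignored):

#include <cmath>

// One GRU-style step per hidden unit, mirroring the gate math in the
// recurrence loop above (parameter names are mine, not from the patch).
inline float grnn_step(float w_x_e,   // W  * x_t  (candidate projection)
                       float wr_x_e,  // Wr * x_t  (reset-gate projection)
                       float wz_x_e,  // Wz * x_t  (update-gate projection)
                       float u_x_h,   // U  * h_{t-1}
                       float ur_x_h,  // Ur * h_{t-1}
                       float uz_x_h,  // Uz * h_{t-1}
                       float h_prev) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  float r = sigmoid(wr_x_e + ur_x_h);
  float z = sigmoid(wz_x_e + uz_x_h);
  float h_tilde = std::tanh(w_x_e + r * u_x_h);
  return z * h_prev + (1.f - z) * h_tilde;
}

With h_prev and the three hidden projections set to zero this reduces to the hidden0 precomputation above: h = (1 - z) * tanh(w_x_e).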
+#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void Run() override; + + virtual ~SearchGrnnCompute() = default; + + private: + void PrepareLayout(const Tensor* input); + void CopyBack(T* from, T* to, int step); +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b85d97e3f1be1f2f02837d347e42ce6731c58414 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_grnn_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_grnn"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_grnn_x86, init) { + SearchGrnnCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_grnn_x86, run_test) { + int num_input = 128; + int num_hidden = 128; + int num_batch = 3; + lite::Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + // out.Resize({num_batch, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* wi_data = wi.mutable_data(); + for (int64_t i = 0; i < wi.numel(); i++) { + wi_data[i] = static_cast(i); + } + auto* wh_data = wh.mutable_data(); + for (int64_t i = 0; i < wh.numel(); i++) { + wh_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SearchGrnnParam param; + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + + SearchGrnnCompute sgc; + sgc.SetContext(std::move(ctx)); + sgc.SetParam(param); + sgc.Run(); + + // std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + LOG(INFO) << out.numel(); + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_data[i], 
ref_results[i], 1e-3); + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_grnn, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_group_padding_compute.cc b/lite/kernels/x86/search_group_padding_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1847ac9dbafc533b8720ab65e6fa1915d5a136e --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" + +REGISTER_LITE_KERNEL( + search_group_padding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGroupPaddingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_emb_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_new", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..17244d15d9124d9d61d1f4fdef4f12590958c0be --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto* bottom0 = param.x; + auto* top0 = param.out_emb_padding; + auto* top1 = param.out_new; + auto* top2 = param.out_padding; + + int _pad_id = param.pad_id; + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + lite::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize({batch * max_seq, dim1}); + // for origin input id + // already set by ShareLoD in InferShape + lite::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize({dim0, 1}); + memset(top1->mutable_data(), + 0, + top1->dims()[0] * top1->dims()[1] * sizeof(T)); + // for padding input id + lite::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize({batch * max_seq, 1}); + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(); + auto* top_padding_input_data = top2->mutable_data(); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, + bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, + 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } + + virtual ~SearchGroupPaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4c36c2a63488a6bb902a2b8b4ad81fa32b37672 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
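For review convenience, the padding scheme in SearchGroupPaddingCompute above amounts to: copy each sequence into a slot of max_seq rows, zero-fill the tail of the slot, and mark the padded positions of the id output with pad_id. A condensed, LoD-free sketch (hypothetical helper, float only):

#include <algorithm>
#include <cstring>
#include <vector>

// Pads each sequence of `emb` (rows of width dim1, delimited by LoD-style
// `offset`) to the longest sequence length; padded rows are zeroed and the
// per-row mask gets pad_id at padded positions (illustrative only).
void group_padding_ref(const std::vector<float>& emb,
                       const std::vector<int>& offset, int dim1, float pad_id,
                       std::vector<float>* out_emb_padding,
                       std::vector<float>* out_padding) {
  const int batch = static_cast<int>(offset.size()) - 1;
  int max_seq = 0;
  for (int i = 0; i < batch; ++i)
    max_seq = std::max(max_seq, offset[i + 1] - offset[i]);
  out_emb_padding->assign(batch * max_seq * dim1, 0.f);
  out_padding->assign(batch * max_seq, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int len = offset[i + 1] - offset[i];
    std::memcpy(out_emb_padding->data() + i * max_seq * dim1,
                emb.data() + offset[i] * dim1, len * dim1 * sizeof(float));
    for (int j = len; j < max_seq; ++j) {
      (*out_padding)[i * max_seq + j] = pad_id;  // mark padded rows
    }
  }
}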
+ +#include "lite/kernels/x86/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_group_padding_x86, retrieve_op) { + auto search_group_padding = + KernelRegistry::Global().Create( + "search_group_padding"); + ASSERT_FALSE(search_group_padding.empty()); + ASSERT_TRUE(search_group_padding.front()); +} + +TEST(search_group_padding_x86, init) { + SearchGroupPaddingCompute search_group_padding; + ASSERT_EQ(search_group_padding.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_group_padding.target(), TARGET(kX86)); +} + +TEST(search_group_padding_x86, run_test) { + lite::Tensor x, out_emb_padding, out_new, out_padding; + x.Resize({2, 3}); + out_emb_padding.Resize({-1, 3}); + out_new.Resize({2, 1}); + out_padding.Resize({-1, 1}); + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sgp_kernel.SetContext(std::move(ctx)); + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + sgp_kernel.Run(); + + std::vector out_emb_padding_ref = {0, 1, 2}; + std::vector out_new_ref = {0, 0}; + std::vector out_padding_ref = {0}; + auto* out_emb_padding_data = out_emb_padding.mutable_data(); + auto* out_new_data = out_new.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(); + for (int i = 0; i < out_emb_padding.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_data[i], out_emb_padding_ref[i], 1e-5); + } + for (int i = 0; i < out_new.dims().production(); i++) { + EXPECT_NEAR(out_new_data[i], out_new_ref[i], 1e-5); + } + for (int i = 0; i < out_padding.dims().production(); i++) { + EXPECT_NEAR(out_padding_data[i], out_padding_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_depadding_compute.cc b/lite/kernels/x86/search_seq_depadding_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..db1816fb48fb85ade2b4ab0b96e7aa4de5236ced --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + const int pad_batch = pad->lod()[0].size() - 1; + const int src_batch = src->lod()[0].size() - 1; + if (pad_batch % src_batch != 0) { + LOG(FATAL) << "Mismatch batch size."; + } + + const auto& pad_offset = pad->lod()[0]; + const int pad_cap_e = pad->dims()[1]; + const auto& src_offset = src->lod()[0]; + const int src_cap_l = src->dims()[0]; + + LoD out_lod; + out_lod.push_back(src_offset); + out->set_lod(out_lod); + out->Resize({src_cap_l, pad_cap_e}); + + const auto* pad_data = pad->template data(); + auto* out_data = out->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + if (pad_i_l < src_i_l) { + LOG(FATAL) + << "the length of padding seq input is less than source seq input."; + } + memcpy(out_data + src_offset[i] * pad_cap_e, + pad_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + search_seq_depadding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqDepaddingCompute, + def) + .BindInput("Pad", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Src", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_depadding_compute.h b/lite/kernels/x86/search_seq_depadding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e48fa92723e332424df02cac3d044d4f2af129b8 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + + void Run() override; + + virtual ~SearchSeqDepaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d978b35ed040d6b7c44354f37999e6e34e2e3ef --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_seq_depadding_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_seq_depadding"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_seq_depadding_x86, init) { + SearchSeqDepaddingCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_seq_depadding_x86, run_test) { + lite::Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + auto* pad_data = pad.mutable_data(); + for (int64_t i = 0; i < pad.dims().production(); i++) { + pad_data[i] = static_cast(i); + } + SearchSeqDepaddingCompute ssdc; + operators::SearchSeqDepaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + ssdc.SetContext(std::move(ctx)); + + param.pad = &pad; + param.src = &src; + param.out = &out; + + ssdc.SetParam(param); + ssdc.Run(); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_seq_depadding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_fc_compute.cc b/lite/kernels/x86/search_seq_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0845bd74c764b04f0e89353ff5c457965e5f115 --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
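SearchSeqDepaddingCompute above is essentially the inverse of that padding step: only the first src_len rows of each padded block are copied back into a compact buffer laid out by the Src LoD. A minimal sketch under the same flat-vector assumptions as before:

#include <cstring>
#include <vector>

// Inverse of the padding sketch: keep src_len rows of each padded block.
void seq_depadding_ref(const std::vector<float>& pad,      // {pad_rows, width}
                       const std::vector<int>& pad_offset, // padded offsets
                       const std::vector<int>& src_offset, // compact offsets
                       int width, std::vector<float>* out) {
  const int batch = static_cast<int>(src_offset.size()) - 1;
  out->assign(src_offset[batch] * width, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int src_len = src_offset[i + 1] - src_offset[i];
    std::memcpy(out->data() + src_offset[i] * width,
                pad.data() + pad_offset[i] * width,
                src_len * width * sizeof(float));
  }
}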
+ +#include "lite/kernels/x86/search_seq_fc_compute.h" + +REGISTER_LITE_KERNEL(search_seq_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..80ef54b30b762848eceb16940c9f60ef8ba96927 --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, false, *w, true, out); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + int M = x_dims[0]; + int N = w_dims[0]; + for (int i = 0; i < M; i++) { + blas.AXPY( + N, static_cast(1), b->data(), out->mutable_data() + i * N); + } + } + } + + virtual ~SearchSeqFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute.cc b/lite/kernels/x86/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95fa27e3d4e7fdbf639a5b275568311907f8344d --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" + +REGISTER_LITE_KERNEL( + sequence_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); +REGISTER_LITE_KERNEL( + search_seq_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..88510b8b1c7a04ab01da9af331f9d1f72765b215 --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
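One note on SearchSeqFcCompute above: after the X * W^T GEMM, the bias is broadcast row-wise with one blas.AXPY call per output row (N elements, alpha = 1). The scalar equivalent of that broadcast, for a row-major {M, N} output and a length-N bias, is simply:

// Row-wise bias broadcast equivalent to the per-row AXPY calls above.
void add_bias_rows(float* out, const float* b, int M, int N) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      out[i * N + j] += b[j];  // i.e. AXPY(N, 1.0f, b, out + i * N)
    }
  }
}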
+ +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + int op_type = param.op_type; + + out->Resize(x->dims()); + out->set_lod(x->lod()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + auto x_seq_offset = x->lod()[0]; + auto y_seq_offset = y->lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = (x->numel()) / (x->dims()[0]); + + // sum + if (op_type == 1) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] + input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // sub + if (op_type == 2) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] - input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // mul + if (op_type == 3) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] * input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + } + + virtual ~SequenceArithmeticCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_x86, retrive_op) { + auto sequence_arithmetic = + KernelRegistry::Global().Create( + "sequence_arithmetic"); + ASSERT_FALSE(sequence_arithmetic.empty()); + ASSERT_TRUE(sequence_arithmetic.front()); +} + +TEST(sequence_arithmetic_x86, init) { + SequenceArithmeticCompute sequence_arithmetic; + ASSERT_EQ(sequence_arithmetic.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_arithmetic.target(), TARGET(kX86)); +} + +TEST(sequence_arithmetic_x86, run_test) { + SequenceArithmeticCompute sequence_arithmetic; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + lite::Tensor x, y, out, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + prepare_input(&x, x_lod); + prepare_input(&y, y_lod); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + + sequence_arithmetic_compute_ref(x, y, &out_ref, param.op_type); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_arithmetic, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_concat_compute.cc b/lite/kernels/x86/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..facdad39d383c3a2134599e1490c89e9d5afa543 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/x86/sequence_concat_compute.h"
+
+REGISTER_LITE_KERNEL(sequence_concat,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::SequenceConcatCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..553e2e8b0667106f25685a9ef155d7e61a672f31
--- /dev/null
+++ b/lite/kernels/x86/sequence_concat_compute.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& param = Param(); + T* dout = param.Out->mutable_data(); + + std::vector x_in_order; + param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); + + int num = x_in_order.size(); + int out_rows = 1; + + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel() / out_rows; + } + + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(dout + col_idx, input_data, sizeof(T) * col_len); + col_idx += col_len; + } + } + + virtual ~SequenceConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..be1f86a5c848b5c03634ea2a1aed0d57f2283879 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
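// A minimal standalone sketch of the LoD merging that ConcatLoD above performs,
// assuming plain std containers; names are illustrative, not the Lite API.
// The merged offset at position i is the sum of every input's offset at i, and
// the output data is laid out sequence-by-sequence, taking sequence i from each
// input in turn (the xs_in_order slices above).
#include <cstddef>
#include <vector>

inline std::vector<size_t> ConcatOffsetsSketch(
    const std::vector<std::vector<size_t>>& input_offsets) {
  std::vector<size_t> result(input_offsets[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    size_t sum = 0;
    for (const auto& offsets : input_offsets) sum += offsets[i];
    result[i] = sum;
  }
  return result;
}
// Example: offsets {0, 3, 5} and {0, 1, 4} merge to {0, 4, 9}.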
+ +#include "lite/kernels/x86/sequence_concat_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + float* name##_data = name.mutable_data(); \ + for (int i = 0; i < name.numel(); ++i) { \ + name##_data[i] = (i - 2.0) * 1.0; \ + } + +} // namespace + +TEST(sequence_concat_x86, retrive_op) { + auto sequence_concat = + KernelRegistry::Global().Create( + "sequence_concat"); + ASSERT_FALSE(sequence_concat.empty()); + ASSERT_TRUE(sequence_concat.front()); +} + +TEST(sequence_concat_x86, init) { + SequenceConcatCompute sequence_concat; + ASSERT_EQ(sequence_concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_concat.target(), TARGET(kX86)); +} + +TEST(sequence_concat_x86, run_test) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3; + lite::Tensor y, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT(x1); + PREPARE_INPUT(x2); + PREPARE_INPUT(x3); + + y_ref.Resize({y_lod_len, feature_len}); + y.Resize({y_lod_len, feature_len}); + y_ref.set_lod(lod_info_y); + y.set_lod(lod_info_y); + + std::vector xs{&x1, &x2, &x3}; + + param.X = xs; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + auto* y_data = y.mutable_data(); + sequence_concat_ref(xs, &y_ref); + float* 
y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_concat, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_reverse_compute.cc b/lite/kernels/x86/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c391e12ad1df671517c182509e415325bb8ce56 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_reverse_compute.h" + +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseFp32; +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_reverse_compute.h b/lite/kernels/x86/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ab93972276664acc8585bd150a53601c039ccf87 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override { + auto& param = this->template Param(); + auto* output = param.Out; + const auto* din = param.X->template data(); + + T* dout = output->template mutable_data(); + CHECK_NE(din, dout) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + + size_t limit = static_cast(param.X->numel()); + size_t row_numel = static_cast(limit / param.X->dims()[0]); + + for (size_t idx = 0; idx < lod_count - 1; ++idx) { + auto start_pos = lod[idx]; + auto end_pos = lod[idx + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(dout + pos * row_numel, + din + cur_pos * row_numel, + row_numel * sizeof(T)); + } + } + output->set_lod(param.X->lod()); + } + + virtual ~SequenceReverseCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b84241c8b19e3db57dd7ef6339496191a7486be --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
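// A minimal standalone sketch of the per-sequence reversal done by the kernel
// above, assuming row-major data and plain std containers; names are
// illustrative, not the Lite API. Within each sequence [start, end), output row
// pos is copied from input row end - 1 - (pos - start), which is the same index
// as cur_pos = end_pos - pos - 1 + start_pos above.
#include <cstring>
#include <vector>

inline void SequenceReverseSketch(const float* din,
                                  float* dout,
                                  const std::vector<size_t>& lod,
                                  size_t row_numel) {
  for (size_t idx = 0; idx + 1 < lod.size(); ++idx) {
    size_t start = lod[idx];
    size_t end = lod[idx + 1];
    for (size_t pos = start; pos < end; ++pos) {
      size_t src = end - 1 - (pos - start);
      std::memcpy(dout + pos * row_numel,
                  din + src * row_numel,
                  row_numel * sizeof(float));
    }
  }
}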
+ +#include "lite/kernels/x86/sequence_reverse_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < seq_offset.size() - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} +} // namespace + +TEST(sequence_reverse_x86, retrive_op) { + auto sequence_reverse = + KernelRegistry::Global().Create( + "sequence_reverse"); + ASSERT_FALSE(sequence_reverse.empty()); + ASSERT_TRUE(sequence_reverse.front()); +} + +TEST(sequence_reverse_x86, init) { + SequenceReverseCompute sequence_reverse; + ASSERT_EQ(sequence_reverse.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_reverse.target(), TARGET(kX86)); +} + +TEST(sequence_reverse_x86, run_test) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + + operators::SequenceReverseParam param; + lite::Tensor x, x_ref; + lite::Tensor y, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(); + float* x_data = x.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x.numel(); ++i) { + x_ref_data[i] = (i - 2.0) * 1.0; + x_data[i] = (i - 2.0) * 1.0; + } + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bd8b287507426798e0ec24f8854e812016b0054 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/sequence_topk_avg_pooling_compute.h" + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.h b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..724415288a72932392d5726778830095c8810e15 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + lite::x86::math::SequenceTopkAvgPoolingFunctor + sequence_topk_avg_pooling; + sequence_topk_avg_pooling(*param.X, + *param.ROW, + *param.COLUMN, + param.Out, + param.pos, + param.channel_num, + param.topks); + }; + virtual ~SequenceTopkAvgPoolingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/softmax_compute.cc b/lite/kernels/x86/softmax_compute.cc index a00aa6d566b4bd9f6a880ab5255f40c71bb1360c..3a2cdc29ed262740aec0efca9460800f57f43437 100644 --- a/lite/kernels/x86/softmax_compute.cc +++ b/lite/kernels/x86/softmax_compute.cc @@ -23,3 +23,13 @@ REGISTER_LITE_KERNEL(softmax, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.cc b/lite/kernels/x86/stack_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f69319a6ca44a7f1a191df16db6b9b6c29553ac --- /dev/null +++ b/lite/kernels/x86/stack_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" + +REGISTER_LITE_KERNEL(stack, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::StackCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..12a6c3490eff9d446de96366c8dd5fe6b2a4bd06 --- /dev/null +++ b/lite/kernels/x86/stack_compute.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/stack_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class StackCompute : public KernelLite { + public: + using param_t = operators::StackParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Out; + + int axis = param.axis; + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto y_data = y->mutable_data(); + std::vector x_datas(n); + for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto dim = x[0]->dims(); + for (int i = 0; i < axis; ++i) pre *= dim[i]; + for (int i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } + } + + virtual ~StackCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d105165a98f936b7a6973e57f5199977a0b8bed3 --- /dev/null +++ b/lite/kernels/x86/stack_compute_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
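// A minimal standalone sketch of the stacking loop in StackCompute above,
// assuming plain std containers; names are illustrative, not the Lite API.
// With pre = product of dims before axis and post = product of dims from axis
// on, each of the pre outer slices copies one post-sized block from every input
// in turn, which interleaves the n inputs along the new axis.
#include <cstring>
#include <vector>

inline void StackSketch(const std::vector<const float*>& xs,  // n same-shape inputs
                        float* y,
                        int pre,
                        int post) {
  size_t x_offset = 0;
  size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (size_t j = 0; j < xs.size(); ++j) {
      std::memcpy(y + y_offset, xs[j] + x_offset, post * sizeof(float));
      y_offset += post;
    }
    x_offset += post;
  }
}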
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +// stack +TEST(stack_x86, retrive_op) { + auto stack = + KernelRegistry::Global().Create("stack"); + ASSERT_FALSE(stack.empty()); + ASSERT_TRUE(stack.front()); +} + +TEST(stack_x86, init) { + lite::kernels::x86::StackCompute stack; + ASSERT_EQ(stack.precision(), PRECISION(kFloat)); + ASSERT_EQ(stack.target(), TARGET(kX86)); +} + +TEST(stack_x86, run_test) { + lite::Tensor x; + lite::Tensor out; + int num_input = 5; + + std::vector x_shape({10, 20, 10}); + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape({5, 10, 20, 10}); + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector input; + for (int i = 0; i < num_input; ++i) { + input.emplace_back(&x); + } + + // StackCompute stack; + StackCompute stack; + operators::StackParam param; + + param.X = input; + param.Out = &out; + int axis = 0; + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + stack.SetContext(std::move(ctx)); + stack.SetParam(param); + stack.Run(); + + int ref_data = 0; + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data, 1e-5); + ref_data++; + ref_data = (ref_data >= 2000) ? (ref_data - 2000) : ref_data; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(stack, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/var_conv_2d_compute.cc b/lite/kernels/x86/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..48ae1b055efc85e16905e3201d467017fc650a5a --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/var_conv_2d_compute.h" + +REGISTER_LITE_KERNEL(var_conv_2d, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c94cb2ca2d43a138b5769653d6cad2d52d420563 --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.h @@ -0,0 +1,213 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Im2Col(const lite::Tensor& input, lite::Tensor* col) const { + auto& param = *param_.get_mutable(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + // std::vector col_lod_vec; + // col_lod_vec.push_back(top_offset); + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + Im2Col(*bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } 
else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } + } + + virtual ~VarConv2DCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6ae5a67bfc9deba1fb097fa5c0c0cf323b65e48 --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/var_conv_2d_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
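// A minimal standalone sketch of the per-sample bookkeeping used by
// VarConv2DCompute::Run above, assuming plain std containers; names are
// illustrative, not the Lite API. Each sample b gets its own spatial output
// size (dim - 1) / stride + 1 derived from the LoD, and the per-sample GEMM is
// [output_channel x (input_channel*kh*kw)] times
// [(input_channel*kh*kw) x top_im_size].
#include <vector>

inline std::vector<int> VarConvTopOffsetsSketch(const std::vector<int>& widths,
                                                const std::vector<int>& heights,
                                                int output_channel,
                                                int stride_w,
                                                int stride_h) {
  std::vector<int> top_offset{0};
  for (size_t b = 0; b < widths.size(); ++b) {
    int top_im_x = (widths[b] == 0) ? 0 : (widths[b] - 1) / stride_w + 1;
    int top_im_y = (heights[b] == 0) ? 0 : (heights[b] - 1) / stride_h + 1;
    top_offset.push_back(top_offset.back() +
                         output_channel * top_im_x * top_im_y);
  }
  return top_offset;
}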
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); 
+ std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_x86, retrive_op) { + auto var_conv_2d = + KernelRegistry::Global().Create( + "var_conv_2d"); + ASSERT_FALSE(var_conv_2d.empty()); + ASSERT_TRUE(var_conv_2d.front()); +} + +TEST(var_conv_2d_x86, init) { + VarConv2DCompute var_conv_2d; + ASSERT_EQ(var_conv_2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(var_conv_2d.target(), TARGET(kX86)); +} + +TEST(var_conv_2d_x86, run_test) { + VarConv2DCompute var_conv_2d; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor Out, Col; + int kernel_h, kernel_w; + int stride_h, stride_w; + int input_channel, output_channel; + + output_channel = 5; + input_channel = 5; + kernel_h = 5; + kernel_w = 5; + stride_h = 1; + stride_w = 1; + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + auto* w_data = W.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(height * width * input_channel); + x_size += height * width * input_channel; + } + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + X.set_lod(x_lod); + auto* x_data = X.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_data[i] = i % 20 * 1.f; + } + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_2d.SetParam(param); + var_conv_2d.SetContext(std::move(ctx)); + var_conv_2d.Run(); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&X, + &W, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(Out.data()[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(Col.data()[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(var_conv_2d, kX86, kFloat, kNCHW, def); diff 
--git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 2c758cf9507087fb53d476ff86a64707e0c6249b..d6fc806ad4541a735ea4ef6eff292076836ac5e7 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/operators/conv_op.h" #include "lite/backends/xpu/builder.h" #include "lite/kernels/xpu/bridges/registry.h" @@ -46,14 +47,36 @@ node_map_type ConvConverter(const std::shared_ptr op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + std::vector output_shape({bs, oc}); for (size_t i = 0; i < 2; i++) { const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; output_shape.push_back( - (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); } DDim output_dims(output_shape); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc index ebdb67bd0d2801a9036696f52790f7104279b0cb..70929ffcd596c299b6d8975c2bfbb8941fc67525 100644 --- a/lite/kernels/xpu/bridges/conv_op_test.cc +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc index ed5f922d59b5ca5e387076c9a533c4b4c251cc87..7efc6b464c00c945c71c8c5689e18823cde10f97 100644 --- a/lite/kernels/xpu/bridges/pool_op_test.cc +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -60,7 +60,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -162,7 +162,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", 
std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("ceil_mode", ceil_mode); // create and convert op to XPU model, then run it on XPU diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 13b6cb5b77d00a2a5f733a0015dec4dbebc088b7..ed3f45c598e74a0450454c15ad0cd9ad09266f8e 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir, SaveParamNaive(path, exec_scope, var.Name()); } } - VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; } #endif diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 49badbb27b00979117f9e75d1c66763a7be99837..7c4048c204b0889f9a9bd72a7e94da3777441d37 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -2,11 +2,10 @@ set(op_DEPS tensor op op_params scope memory) lite_cc_library(op_params SRCS op_params.cc DEPS tensor any) +# 1.baisc ops used in basic models add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS}) add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS}) add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS}) -add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) -add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS}) add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS}) add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS}) @@ -15,57 +14,64 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) -add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS}) -add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) -add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS}) add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS}) -#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS}) -add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS}) -add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS}) add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS}) add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS}) add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS}) add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS}) -add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS}) -add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS}) -add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS}) 
-add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) -add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS}) add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS}) -add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS}) -add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS}) add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS}) -add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS}) -add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS}) -add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS}) add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS}) add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS}) add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS}) -add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS}) -add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS}) -add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS}) -add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS}) -add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS}) -add_operator(unsqueeze_op_lite extra SRCS unsqueeze_op.cc DEPS ${op_DEPS}) -add_operator(im2sequence_op basic SRCS im2sequence_op.cc DEPS ${op_DEPS}) +add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) +add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) +add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) +add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) +add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) +add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) +add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) +add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) +add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) +add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) +add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) +add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) + +# 2.basic ops not used in basic models +add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) +add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) +add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) +add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) + +# 3.extra ops +add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) +add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) +add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) +add_operator(uniform_random_op extra SRCS uniform_random_op.cc DEPS ${op_DEPS}) +add_operator(axpy_op extra SRCS axpy_op.cc DEPS ${op_DEPS}) +add_operator(gru_unit_op extra SRCS 
gru_unit_op.cc DEPS ${op_DEPS}) +add_operator(gru_op extra SRCS gru_op.cc DEPS ${op_DEPS}) +add_operator(layout_once_op extra SRCS layout_once_op.cc DEPS ${op_DEPS}) +add_operator(density_prior_box_op extra SRCS density_prior_box_op.cc DEPS ${op_DEPS}) +add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS}) +add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) +add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) +add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) -add_operator(reduce_mean_op extra SRCS reduce_mean_op.cc DEPS ${op_DEPS}) -add_operator(stack_op extra SRCS stack_op.cc DEPS ${op_DEPS}) -add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS}) -add_operator(affine_channel_op extra SRCS affine_channel_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) @@ -73,16 +79,26 @@ add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) -add_operator(range_op extra SRCS range_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) + add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) +add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) +add_operator(search_grnn_op_lite extra SRCS search_grnn_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_softmax_op_lite extra SRCS search_seq_softmax_op.cc DEPS ${op_DEPS}) +add_operator(sequence_concat_op_lite extra SRCS sequence_concat_op.cc DEPS ${op_DEPS}) +add_operator(var_conv_2d_op_lite extra SRCS var_conv_2d_op.cc DEPS ${op_DEPS}) +add_operator(attention_padding_mask_op_lite extra SRCS attention_padding_mask_op.cc DEPS ${op_DEPS}) +add_operator(sequence_arithmetic_op_lite extra SRCS sequence_arithmetic_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS}) +add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS}) add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS}) add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS}) @@ -106,7 +122,11 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS 
increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) - +# for content-dnn specific +add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) +add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) +add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc @@ -122,8 +142,8 @@ if (NOT LITE_WITH_X86) lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory) lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope) lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm) - lite_cc_test(test_fusion_elementwise_activation_ops - SRCS fusion_elementwise_activation_ops_test.cc - DEPS fusion_elementwise_activation_ops memory) lite_cc_test(test_transpose_op SRCS transpose_op_test.cc DEPS transpose_op memory) + lite_cc_test(test_fusion_elementwise_activation_ops + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops memory) endif() diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index c3c5de311f41f88fbeed4b03f9bfd618cf51c3b3..6ddcee0cb9e7fb0ef6df8a8c03d85fe406590b9d 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -117,6 +117,7 @@ REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/attention_padding_mask_op.cc b/lite/operators/attention_padding_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a88df0e7a902c6cac63eb77377bb0b49ee30c9b3 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/attention_padding_mask_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AttentionPaddingMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pad_begin); + return true; +} + +bool AttentionPaddingMaskOp::InferShape() const { + auto src_len = param_.X->lod()[0][1]; + CHECK_EQ(src_len, param_.X->dims()[1]) + << "Mismatch source length, expect: " << src_len + << ", get: " << param_.X->lod()[0][1]; + auto att_batch = param_.X->lod()[0].size() - 1; + auto src_batch = param_.Y->lod()[0].size() - 1; + CHECK_EQ(att_batch % src_batch, 0) + << "Mismatch batch size, bottom0: " << att_batch + << ", bottom1: " << src_batch; + + param_.pad_begin->Resize({static_cast(src_batch)}); + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + + return true; +} + +bool AttentionPaddingMaskOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + param_.X = scope->FindTensor(op_desc.Input("X").front()); + param_.Y = scope->FindTensor(op_desc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.pad_begin = + scope->FindMutableTensor(op_desc.Output("pad_begin").front()); + + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.mask = op_desc.GetAttr("mask"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); +REGISTER_LITE_OP(search_attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); diff --git a/lite/operators/attention_padding_mask_op.h b/lite/operators/attention_padding_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..894d68f6226720139aee07274d4ac5cf660749f1 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
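The shape contract enforced by AttentionPaddingMaskOp::InferShape above can be checked in isolation: pad_begin gets one entry per source sequence, and the attention batch must divide evenly by the source batch. A minimal standalone sketch follows; the helper and variable names are illustrative and not part of the patch.

// Standalone sketch (not part of the patch) of the batch arithmetic used by
// attention_padding_mask: att_batch = len(X.lod[0]) - 1 must be a multiple of
// src_batch = len(Y.lod[0]) - 1, and pad_begin is resized to src_batch.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t PadBeginLen(const std::vector<uint64_t>& x_lod0,
                    const std::vector<uint64_t>& y_lod0) {
  const int64_t att_batch = static_cast<int64_t>(x_lod0.size()) - 1;
  const int64_t src_batch = static_cast<int64_t>(y_lod0.size()) - 1;
  if (att_batch % src_batch != 0) return -1;  // the op CHECKs on this mismatch
  return src_batch;                           // pad_begin is resized to this
}

int main() {
  std::cout << PadBeginLen({0, 4, 8, 12, 16}, {0, 5, 9}) << "\n";  // prints 2
}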
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AttentionPaddingMaskOp : public OpLite { + public: + AttentionPaddingMaskOp() {} + + explicit AttentionPaddingMaskOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "attention_padding_mask"; } + + private: + mutable AttentionPaddingMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index ceca1a61ce3457ed0a2c25541d02bd868c380b3b..6dab55ff3b6c55e7763484d78c6c36bf85017128 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -39,56 +39,38 @@ bool ConvOpLite::CheckShape() const { return true; } -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - // CHECK_GT_OR_FALSE(output_size, 0); + int output_size = + (input_size + (pad_left + pad_right) - dkernel) / stride + 1; return output_size; } -inline void UpdatePaddingAndDilation(std::vector* paddings, - std::vector* dilations, - const std::vector& strides, - const std::string padding_algorithm, - const lite::DDim data_dims, - const lite::DDim& ksize) { - // when padding_desc is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (size_t i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], - (int64_t)0); - // pad - *(paddings->begin() + i) = pad_sum / 2; - // dilation - *(dilations->begin() + i) = 1; - } - } else if (padding_algorithm == "VALID") { - for (auto& it : *paddings) { - it = 0; - } - } -} - bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); - UpdatePaddingAndDilation(¶m_.paddings, - ¶m_.dilations, + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), param_.strides, padding_algorithm_, in_dims, filter_dims); std::vector output_shape({in_dims[0], filter_dims[0]}); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; for (size_t i = 0; i < param_.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - param_.dilations[i], - param_.paddings[i], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], param_.strides[i])); } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index e764819f6308e9723f185bc73979000af7f72b5b..3ab34bc1d0bd631b0641cebd3db29cfff9316bb0 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/kernel.h" @@ -47,9 +48,10 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -109,12 +111,24 @@ class ConvOpLite : public OpLite { param_.output_scale = op_desc.GetAttr("output_scale"); } } + + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < param_.strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); return true; } - void AttachKernel(KernelBase* kernel) override { - kernel->SetParam(param_); - } + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conv2d"; } @@ -123,6 +137,34 @@ class ConvOpLite : public OpLite { std::string padding_algorithm_{""}; }; +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index fb6b431fff8ab20dd1a6d1abc8aff7443771ee2f..a472ae07455dd1b10688a4b033358bba70d8f34f 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
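The padding handling introduced above (expanding a 2-element paddings attribute into 4 entries, then resolving the "SAME" algorithm into a top/bottom and left/right split) reduces to a few lines of arithmetic. A minimal standalone sketch, assuming NCHW layout; the helper names are illustrative, not from the patch.

// Standalone sketch mirroring the 2-pad -> 4-pad expansion and the "SAME"
// rule used by ConvOpLite above.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int> ExpandTo4Pad(std::vector<int> pads) {
  // {pad_h, pad_w} becomes {top, bottom, left, right}; 4 entries pass through.
  if (pads.size() == 2) pads = {pads[0], pads[0], pads[1], pads[1]};
  return pads;
}

void ApplySamePadding(std::vector<int>* pads,               // 4 entries
                      const std::vector<int>& strides,      // {stride_h, stride_w}
                      const std::vector<int64_t>& data_hw,  // {H, W}
                      const std::vector<int64_t>& k_hw) {   // {kernel_h, kernel_w}
  for (size_t i = 0; i < strides.size(); ++i) {
    int64_t out = (data_hw[i] + strides[i] - 1) / strides[i];
    int64_t pad_sum =
        std::max<int64_t>((out - 1) * strides[i] + k_hw[i] - data_hw[i], 0);
    (*pads)[2 * i] = static_cast<int>(pad_sum / 2);
    (*pads)[2 * i + 1] = static_cast<int>(pad_sum - pad_sum / 2);
  }
}

int main() {
  auto pads = ExpandTo4Pad({0, 0});
  ApplySamePadding(&pads, {2, 2}, {6, 6}, {3, 3});
  // H = W = 6, stride 2, kernel 3: pad_sum = 1 per axis -> pads = {0, 1, 0, 1}
  for (int p : pads) std::cout << p << " ";
}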
- #include "lite/operators/conv_transpose_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -32,24 +32,75 @@ bool ConvTransposeOpLite::CheckShape() const { CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); - CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); CHECK_OR_FALSE(in_dims[1] % param_.groups == 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); return true; } +inline int ConvTransposeOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size - 1) * stride - pad_left - pad_right + dkernel; + + return output_size; +} + +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + bool ConvTransposeOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), + param_.strides, + padding_algorithm_, + in_dims, + filter_dims); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; + std::vector output_shape; output_shape.push_back(in_dims[0]); output_shape.push_back(filter_dims[1] * param_.groups); - for (int i = 0; i < param_.strides.size(); i++) { - int kernel_extent = param_.dilations[i] * (filter_dims[i + 2] - 1) + 1; - int output_len = (in_dims[i + 2] - 1) * param_.strides[i] + kernel_extent - - 2 * param_.paddings[i]; - output_shape.push_back(output_len); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back(ConvTransposeOutputSize(in_dims[i + 2], + filter_dims[i + 2], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], + param_.strides[i])); } // Set output dims @@ -58,8 +109,8 @@ bool ConvTransposeOpLite::InferShape() const { } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
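The transposed-convolution output size used above follows out = (in - 1) * stride - (pad_left + pad_right) + dilation * (filter - 1) + 1. A small standalone sketch with a worked value; the function name is illustrative.

// Standalone sketch of the output-size formula used by ConvTransposeOutputSize.
#include <iostream>

int ConvTransposeOut(int in, int k, int dilation, int pl, int pr, int stride) {
  const int dkernel = dilation * (k - 1) + 1;
  return (in - 1) * stride - pl - pr + dkernel;
}

int main() {
  // e.g. in=8, k=3, dilation=1, pads 1/1, stride=2 -> (8-1)*2 - 2 + 3 = 15
  std::cout << ConvTransposeOut(8, 3, 1, 1, 1, 2) << "\n";  // prints 15
}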
-bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { auto X = op_desc.Input("Input").front(); auto Filter = op_desc.Input("Filter").front(); auto Out = op_desc.Output("Output").front(); @@ -68,9 +119,27 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -81,7 +150,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, auto bias_var = scope->FindVar(bias_arguments.front()); if (bias_var != nullptr) { param_.bias = - const_cast(&(bias_var->Get())); + const_cast(&(bias_var->Get())); } } } diff --git a/lite/operators/conv_transpose_op.h b/lite/operators/conv_transpose_op.h index d8b64c78efdcc00b5842c90336ce195b55d59370..fb25c022f974ad195bf72b19cb9b459b2d11d5f2 100644 --- a/lite/operators/conv_transpose_op.h +++ b/lite/operators/conv_transpose_op.h @@ -44,6 +44,7 @@ class ConvTransposeOpLite : public OpLite { private: mutable ConvParam param_; + std::string padding_algorithm_{""}; }; } // namespace operators diff --git a/lite/operators/fill_constant_op.cc b/lite/operators/fill_constant_op.cc index 6e4bee4da87095245d90c6af5db98d2e95d7d3d8..acf9701cbd750e83ba51f25c66064c2dd7781db6 100644 --- a/lite/operators/fill_constant_op.cc +++ b/lite/operators/fill_constant_op.cc @@ -29,6 +29,12 @@ class FillConstantOp : public OpLite { } bool InferShape() const override { + lite::Tensor* shape_tensor_ = param_.shape_tensor; + if (param_.shape.empty() && shape_tensor_ != nullptr) { + param_.Out->Resize(shape_tensor_->dims()); + return true; + } + param_.Out->Resize(param_.shape); return true; } @@ -41,6 +47,23 @@ class FillConstantOp : public OpLite { param_.shape = opdesc.GetAttr>("shape"); param_.value = opdesc.GetAttr("value"); param_.force_cpu = opdesc.GetAttr("force_cpu"); + param_.shape_tensor = nullptr; + param_.shape_tensor_list = {}; + + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "ShapeTensor") != input_arg_names.end()) { + auto args = opdesc.Input("ShapeTensor"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor = var->GetMutable(); + } + if (opdesc.HasAttr("ShapeTensorList")) { + auto args = opdesc.Input("ShapeTensorList"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/interpolate_op.cc b/lite/operators/interpolate_op.cc index 
b98240ba4f255377c0ac661950a45bef0a7d0516..936da73d89007f4f6dd36fa770df537996c40a51 100644 --- a/lite/operators/interpolate_op.cc +++ b/lite/operators/interpolate_op.cc @@ -45,23 +45,42 @@ bool InterpolateOp::InferShape() const { int out_h; int out_w; - if (OutSize != nullptr) { - auto outsize_data = OutSize->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - param_.Out->Resize({n, c, h_out, w_out}); + auto SizeTensor = param_.SizeTensor; + if (!SizeTensor.empty()) { + CHECK(SizeTensor.size() == 2) + << "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input tensor."; + out_h = param_.out_h; + out_w = param_.out_w; + param_.Out->Resize({n, c, out_h, out_w}); + return true; + } + + auto Scale = param_.Scale; + if (Scale) { + auto scale_dims = Scale->dims(); + CHECK(scale_dims.size() == 1) << "Scale's dimension size must be 1."; + out_h = -1; + out_w = -1; } else { - if (0 >= param_.out_h && 0 >= param_.out_w) { - out_h = h * param_.scale; - out_w = w * param_.scale; + auto scale = param_.scale; + if (scale > 0) { + out_h = static_cast(h * scale); + out_w = static_cast(w * scale); out_h = out_h > 0 ? out_h : -1; out_w = out_w > 0 ? out_w : -1; } else { out_h = param_.out_h; out_w = param_.out_w; } - param_.Out->Resize({n, c, out_h, out_w}); } + + if (OutSize != nullptr) { + auto out_lod = param_.Out->mutable_lod(); + *out_lod = param_.X->lod(); + } + param_.Out->Resize({n, c, out_h, out_w}); + return true; } @@ -76,6 +95,24 @@ bool InterpolateOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } else { param_.OutSize = nullptr; } + + if (op_desc.HasInput("SizeTensor")) { + auto size_tensor = op_desc.Input("SizeTensor"); + for (auto var : size_tensor) { + param_.SizeTensor.push_back( + scope->FindVar(var)->GetMutable()); + } + } + + if (op_desc.HasInput("Scale")) { + auto scale_var_names = op_desc.Input("Scale"); + if (scale_var_names.size() > 0) { + param_.Scale = + scope->FindVar(scale_var_names.front())->GetMutable(); + } + } else { + param_.Scale = nullptr; + } auto Out = op_desc.Output("Out").front(); param_.X = scope->FindVar(X)->GetMutable(); param_.Out = scope->FindVar(Out)->GetMutable(); diff --git a/lite/operators/lookup_table_v2_op.cc b/lite/operators/lookup_table_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c783695163b1d95964ac1a8a9d79d7167811261a --- /dev/null +++ b/lite/operators/lookup_table_v2_op.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
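The scale branch of InterpolateOp::InferShape above reduces to multiplying the spatial dims by the scale attribute when neither SizeTensor nor OutSize supplies an explicit shape, falling back to the out_h/out_w attributes otherwise. A simplified standalone sketch (it omits the clamp of non-positive results to -1); names are illustrative.

// Standalone sketch of the nearest/bilinear output-shape selection above.
#include <iostream>

void InterpOutHW(int h, int w, float scale, int attr_h, int attr_w,
                 int* out_h, int* out_w) {
  if (scale > 0.f) {
    *out_h = static_cast<int>(h * scale);
    *out_w = static_cast<int>(w * scale);
  } else {
    *out_h = attr_h;
    *out_w = attr_w;
  }
}

int main() {
  int oh = 0, ow = 0;
  InterpOutHW(32, 48, 2.0f, -1, -1, &oh, &ow);
  std::cout << oh << "x" << ow << "\n";  // prints 64x96
}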
+
+#include "lite/operators/lookup_table_v2_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool LookupTableV2OpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.W)
+  CHECK_OR_FALSE(param_.Ids)
+  CHECK_OR_FALSE(param_.Out)
+
+  auto table_dims = param_.W->dims();
+
+  CHECK_EQ_OR_FALSE(table_dims.size(), 2)
+
+  return true;
+}
+
+bool LookupTableV2OpLite::InferShape() const {
+  auto table_dims = param_.W->dims();
+  auto ids_dims = param_.Ids->dims();
+
+  std::vector<int64_t> out_dims;
+  for (int i = 0; i < ids_dims.size(); ++i) {
+    out_dims.push_back(ids_dims[i]);
+  }
+  out_dims.push_back(table_dims[1]);
+  param_.Out->Resize(lite::DDim{out_dims});
+  param_.Out->set_lod(param_.Ids->lod());
+  return true;
+}
+
+bool LookupTableV2OpLite::AttachImpl(const cpp::OpDesc &op_desc,
+                                     lite::Scope *scope) {
+  auto input = op_desc.Input("W").front();
+  auto ids = op_desc.Input("Ids").front();
+  auto out = op_desc.Output("Out").front();
+
+  param_.W = scope->FindVar(input)->GetMutable<lite::Tensor>();
+  param_.Ids = scope->FindVar(ids)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
+
+  param_.padding_idx = op_desc.GetAttr<int64_t>("padding_idx");
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(lookup_table_v2, paddle::lite::operators::LookupTableV2OpLite)
diff --git a/lite/operators/lookup_table_v2_op.h b/lite/operators/lookup_table_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dabff3f0cac75cb70cde6eb6e95df34dc36901fe
--- /dev/null
+++ b/lite/operators/lookup_table_v2_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
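The lookup_table_v2 shape rule above is simply Ids' dims with the embedding width appended. A standalone sketch with a worked example; the function name is illustrative, not from the patch.

// Standalone sketch: Out keeps all of Ids' dims and appends W.dims()[1].
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> LookupOutDims(const std::vector<int64_t>& ids_dims,
                                   const std::vector<int64_t>& table_dims) {
  std::vector<int64_t> out(ids_dims);
  out.push_back(table_dims[1]);
  return out;
}

int main() {
  auto out = LookupOutDims({4, 16}, {30000, 128});
  for (auto d : out) std::cout << d << " ";  // prints 4 16 128
}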
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class LookupTableV2OpLite : public OpLite { + public: + LookupTableV2OpLite() {} + explicit LookupTableV2OpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "LookupTable"; } + + private: + mutable LookupTableParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/lrn_op.cc b/lite/operators/lrn_op.cc index 34b00653f91d03f8e661fac56b5931d928be15b2..aff3e5af5566771411acf20736fdbec703f5def9 100644 --- a/lite/operators/lrn_op.cc +++ b/lite/operators/lrn_op.cc @@ -37,11 +37,13 @@ bool LrnOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Out_name = opdesc.Output("Out").front(); param_.X = GetVar(scope, X_name); param_.Out = GetMutableVar(scope, Out_name); - param_.local_size = opdesc.GetAttr("local_size"); + param_.n = opdesc.GetAttr("n"); param_.alpha = opdesc.GetAttr("alpha"); param_.beta = opdesc.GetAttr("beta"); param_.k = opdesc.GetAttr("k"); - param_.norm_region = opdesc.GetAttr("norm_region"); + if (opdesc.HasAttr("norm_region")) { + param_.norm_region = opdesc.GetAttr("norm_region"); + } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8095a94bf75cd5d6d9087509449c159056ebc28 --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/match_matrix_tensor_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MatchMatrixTensorOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp); + + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(y_dims.size() == 2); + CHECK_OR_FALSE(w_dims.size() == 3); + + CHECK_OR_FALSE(x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && + w_dims[1] == dim_t); + + return true; +} + +bool MatchMatrixTensorOpLite::InferShape() const { + const Tensor* x = param_.x; + const Tensor* y = param_.y; + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + const auto& x_lod = x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + const auto& x_lod_0 = x_lod[0]; + CHECK_OR_FALSE(x_lod_0.size() >= 2); + CHECK_OR_FALSE(x_dims[0] == x_lod_0.back()); + + const auto& y_lod = y->lod(); + CHECK_OR_FALSE(!y_lod.empty()); + const auto& y_lod_0 = y_lod[0]; + CHECK_OR_FALSE(y_lod_0.size() >= 2); + CHECK_OR_FALSE(y_dims[0] == y_lod_0.back()); + + CHECK_OR_FALSE(x_lod_0.size() == y_lod_0.size()); + + int out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + int tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + + param_.out->Resize({out_dim_0, 1}); + param_.tmp->Resize({tmp_dim_0, 1}); + return true; +} + +bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto y = op_desc.Input("Y").front(); + auto out = op_desc.Output("Out").front(); + auto tmp = op_desc.Output("Tmp").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.y = scope->FindVar(y)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp = scope->FindVar(tmp)->GetMutable(); + + param_.dim_t = op_desc.GetAttr("dim_t"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(match_matrix_tensor, + paddle::lite::operators::MatchMatrixTensorOpLite); diff --git a/lite/operators/match_matrix_tensor_op.h b/lite/operators/match_matrix_tensor_op.h new file mode 100644 index 0000000000000000000000000000000000000000..404183ea5bda3c35ba8b833853bc0005d60b9f7d --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
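The first output dimension computed in MatchMatrixTensorOpLite::InferShape above is dim_t times the sum over LoD segments of x_len * y_len. A standalone sketch with a worked value; the function name is illustrative.

// Standalone sketch of the out_dim_0 computation for match_matrix_tensor.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t MatchMatrixOutDim0(const std::vector<uint64_t>& x_lod0,
                           const std::vector<uint64_t>& y_lod0, int dim_t) {
  int64_t out = 0;
  for (size_t i = 1; i < x_lod0.size(); ++i) {
    out += static_cast<int64_t>(x_lod0[i] - x_lod0[i - 1]) *
           static_cast<int64_t>(y_lod0[i] - y_lod0[i - 1]);
  }
  return out * dim_t;
}

int main() {
  // two sequence pairs of lengths (3,4) and (2,5), dim_t = 2 -> 2*(12+10) = 44
  std::cout << MatchMatrixOutDim0({0, 3, 5}, {0, 4, 9}, 2) << "\n";  // 44
}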
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MatchMatrixTensorOpLite : public OpLite { + public: + MatchMatrixTensorOpLite() {} + + explicit MatchMatrixTensorOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "match_matrix_tensor"; } + + private: + mutable MatchMatrixTensorParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8609f178886808f5dedf2de86e7cf7941c4a4c5d..4f0c707484f6a66148dabc80968665c1d38de445 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include @@ -89,11 +90,21 @@ struct FcParam { WITH_INT8_CONFIG }; +struct SearchSeqFcParam { + lite::Tensor* x{nullptr}; + lite::Tensor* w{nullptr}; + lite::Tensor* b{nullptr}; + lite::Tensor* out{nullptr}; + int out_size; +}; + // For Interpolate Op struct InterpolateParam { lite::Tensor* X{}; lite::Tensor* OutSize{}; lite::Tensor* Out{}; + std::vector SizeTensor; + lite::Tensor* Scale{}; float scale{0.f}; int out_h{-1}; @@ -101,6 +112,7 @@ struct InterpolateParam { bool align_corners{true}; int align_mode{1}; std::string interp_method{"Nearest"}; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; }; // For Mul Op @@ -242,9 +254,19 @@ struct ConvParam { lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; int groups{1}; - std::vector dilations{1, 1}; + /* dilations type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> dilations; bool fuse_relu_before_depthwise_conv{false}; bool use_mkldnn{false}; bool fuse_relu{false}; // only used in mkldnn kernel @@ -291,7 +313,12 @@ struct PoolParam { bool global_pooling{ false}; // if true, knernel size and paddings will be ignored std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; bool exclusive{true}; bool adaptive{false}; bool ceil_mode{false}; @@ -317,6 +344,9 @@ struct DropoutParam { struct SplitParam { lite::Tensor* x{}; std::vector output{}; + lite::Tensor* axis_tensor; + std::vector sections_tensor_list{}; + int axis{-1}; int num{0}; std::vector sections; @@ -378,6 +408,9 @@ struct MeanGradParam { struct FillConstantParam { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; std::vector shape{}; + lite::Tensor* shape_tensor; + std::vector shape_tensor_list{}; + float value{0.0f}; // useless for x86, keep it for compatibility bool force_cpu{false}; @@ -511,8 +544,8 @@ struct GRUUnitParam { struct LrnParam { const 
lite::Tensor* X{}; lite::Tensor* Out{}; - int local_size{5}; - float alpha{1.}; + int n{5}; + float alpha{1e-4}; float beta{0.75}; float k{1.}; std::string norm_region{"AcrossChannels"}; @@ -729,6 +762,14 @@ struct SequencePoolParam { #endif }; +struct SearchGroupPaddingParam { + lite::Tensor* x{}; + lite::Tensor* out_emb_padding{}; + lite::Tensor* out_new{}; + lite::Tensor* out_padding{}; + int pad_id; +}; + struct SequenceReshapeParam { lite::Tensor* x{}; lite::Tensor* output{}; @@ -748,6 +789,32 @@ struct SequenceExpandAsParam { lite::Tensor* out{nullptr}; }; +struct SequenceReverseParam { + const lite::Tensor* X{}; + lite::Tensor* Out{}; +}; + +struct SequenceConcatParam { + std::vector X{}; + lite::Tensor* Out{}; +}; + +struct AttentionPaddingMaskParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int pad_id; + float mask; + lite::Tensor* Out{}; + lite::Tensor* pad_begin{}; +}; + +struct SequenceArithmeticParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int op_type{1}; + lite::Tensor* Out{}; +}; + struct ReduceMaxParam { const lite::Tensor* X{}; lite::Tensor* Out{}; @@ -776,6 +843,22 @@ struct ReduceParam { bool reduce_all{false}; }; +struct VarConv2DParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + const lite::Tensor* W{}; + lite::Tensor* Out{}; + lite::Tensor* Col{}; + + int input_channel; + int output_channel; + int stride_h; + int stride_w; + int kernel_h; + int kernel_w; +}; + /// ----------------------- shape operators ---------------------- struct ShapeParam { const lite::Tensor* X{}; @@ -856,7 +939,7 @@ struct UnsqueezeParam { lite::Tensor* XShape{}; std::vector axes{}; const lite::Tensor* axes_tensor{}; - std::vector* axes_tensor_vct{}; + std::vector axes_tensor_vct{}; }; /// ----------------------- expand operators ---------------------- @@ -922,6 +1005,57 @@ struct AssignValueParam { lite::Tensor* Out{}; }; +/// --------------- sequence_topk_avg_pooling operators ------------------ +struct SequenceTopkAvgPoolingParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + lite::Tensor* Out{}; + lite::Tensor* pos{}; + int channel_num{}; + std::vector topks{}; +}; + +/// --------------- search_fc operators ------------------ +struct SearchFcParam { + const lite::Tensor* X{}; + const lite::Tensor* W{}; + const lite::Tensor* b{}; + lite::Tensor* Out{}; + int out_size{}; +}; +/// --------------------- match_matrix_tensor operators -------------------- +struct MatchMatrixTensorParam { + const lite::Tensor* x{}; + const lite::Tensor* y{}; + const lite::Tensor* w{}; + lite::Tensor* out{}; + lite::Tensor* tmp{}; + + int dim_t; +}; + +/// --------------------- search_seq_depadding operators -------------------- +struct SearchSeqDepaddingParam { + const lite::Tensor* pad{}; + const lite::Tensor* src{}; + lite::Tensor* out{}; +}; + +/// --------------------- search_grnn operators -------------------- +struct SearchGrnnParam { + const lite::Tensor* x{}; + const lite::Tensor* wi{}; + const lite::Tensor* wh{}; + int num_input; + int num_hidden; + + lite::Tensor* out{}; + lite::Tensor* tmp_buffer{}; + lite::Tensor* idx_sorted_by_width{}; + lite::Tensor* layout_input{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc index 1ebbc059b76572886f5ff7c8ce1e32b593070fa0..c6f6eed28f8cdb5f080b6f4367a1b88b1dbc0701 100644 --- a/lite/operators/pool_op.cc +++ b/lite/operators/pool_op.cc @@ -13,6 +13,7 @@ 
// limitations under the License. #include "lite/operators/pool_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -26,7 +27,7 @@ bool PoolOpLite::CheckShape() const { const auto& x_dims = param_.x->dims(); const auto& ksize = param_.ksize; const auto& strides = param_.strides; - const auto& paddings = param_.paddings; + const auto& paddings = *param_.paddings; // "Pooling intput should be 4-D or 5-D tensor." CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); @@ -34,20 +35,27 @@ bool PoolOpLite::CheckShape() const { CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); // Strides size and pooling size should be the same. CHECK_OR_FALSE(ksize.size() == strides.size()); - // Paddings size and pooling size should be the same. - CHECK_OR_FALSE(ksize.size() == paddings.size()); + // Paddings size must be 4. + CHECK_OR_FALSE(paddings.size() == 4L); return true; } -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -55,14 +63,21 @@ int PoolOutputSize( bool PoolOpLite::InferShape() const { const auto x_dims = param_.x->dims(); std::vector& ksize = param_.ksize; + // dynamic update 4-pad + UpdatePadding(param_.paddings.get(), + param_.global_pooling, + param_.adaptive, + padding_algorithm_, + x_dims, + param_.strides, + ksize); if (param_.global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_.paddings[i] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - + auto paddings = *param_.paddings; std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert( @@ -71,15 +86,14 @@ bool PoolOpLite::InferShape() const { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_.ksize[i], - param_.paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_.strides[i], param_.ceil_mode)); } } param_.output->Resize(lite::DDim(output_shape)); - // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - // ctx->ShareLoD("X", "Out"); return true; } diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index aecec4c61955cecd67f485662feb1a937681c165..c44875ff95b554ca92cf5288597a5bdaf2cb1bf8 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include #include "lite/core/kernel.h" @@ -51,7 +53,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); @@ -65,7 +67,23 @@ class PoolOpLite : public OpLite { if (op_desc.HasAttr("use_quantizer")) { param_.use_quantizer = op_desc.GetAttr("use_quantizer"); } - // param_.data_format = op_desc.GetAttr("data_format"); + if (op_desc.HasAttr("padding_algorithm")) 
{ + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the inputs size."; + } + } + param_.paddings = std::make_shared>(paddings); + return true; } @@ -75,8 +93,42 @@ class PoolOpLite : public OpLite { private: mutable PoolParam param_; + std::string padding_algorithm_{""}; }; +inline void UpdatePadding(std::vector *paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const lite::DDim data_dims, + const std::vector &strides, + const std::vector &ksize) { + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/search_aligned_mat_mul_op.cc b/lite/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43a276e3c7a2f7481ade2ee18c1446593f7c5f43 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_aligned_mat_mul_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchAlignedMatMulOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + + return true; +} + +bool SearchAlignedMatMulOpLite::InferShape() const { + const auto x_dims = param_.X->dims(); + const auto y_dims = param_.Y->dims(); + const auto& x_lod = param_.X->lod(); + const auto& y_lod = param_.Y->lod(); + bool x_transpose = param_.transpose_X; + bool y_transpose = param_.transpose_Y; + + CHECK_EQ(x_dims.size(), 2) << "X should be 2-D tensor"; + CHECK_EQ(y_dims.size(), 2) << "Y should be 2-D tensor"; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + CHECK(!y_lod.empty()) << "The Input(Y) must hold lod info."; + + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(y_lod_0.size(), 2) << "The Input(Y)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(y_dims[0], static_cast(y_lod_0.back())) + << "The Input(Y)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(x_lod_0.size(), y_lod_0.size()) + << "The Length of X and Y must be equal."; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + param_.Out->set_lod(out_lod); + param_.Out->Resize(out_dims); + return true; +} + +bool SearchAlignedMatMulOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto X = op_desc.Input("X").front(); + auto Y = op_desc.Input("Y").front(); + auto Out = op_desc.Output("Out").front(); + param_.X = GetVar(scope, X); + param_.Y = GetVar(scope, Y); + param_.Out = GetMutableVar(scope, Out); + param_.transpose_X = op_desc.GetAttr("transpose_X"); + param_.transpose_Y = op_desc.GetAttr("transpose_Y"); + param_.alpha = op_desc.GetAttr("alpha"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_aligned_mat_mul, + paddle::lite::operators::SearchAlignedMatMulOpLite); diff --git a/lite/operators/search_aligned_mat_mul_op.h b/lite/operators/search_aligned_mat_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7321b7e9d15331e6aad36364436a99d3d4089c8c --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchAlignedMatMulOpLite : public OpLite { + public: + SearchAlignedMatMulOpLite() {} + + explicit SearchAlignedMatMulOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_aligned_mat_mul"; } + + private: + mutable MatMulParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e77e361624e681aa93e36610674df0e1f9a13af --- /dev/null +++ b/lite/operators/search_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/search_fc_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SearchFcOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.W);
+  CHECK_OR_FALSE(param_.b);
+  CHECK_OR_FALSE(param_.Out);
+
+  auto x_dims = param_.X->dims();
+  CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) should be 2.";
+  auto w_dims = param_.W->dims();
+  CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor.";
+  auto b_dims = param_.b->dims();
+  CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor.";
+  CHECK_EQ(w_dims[1], x_dims[1]) << "wrong shape: w_dims[1] != x_dims[1]";
+  return true;
+}
+
+bool SearchFcOpLite::InferShape() const {
+  auto out_size = param_.out_size;
+  lite::DDim dims(std::vector<int64_t>({-1, out_size}));
+  param_.Out->Resize(dims);
+  return true;
+}
+
+bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc,
+                                lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto W = op_desc.Input("W").front();
+  auto b = op_desc.Input("b").front();
+  auto Out = op_desc.Output("Out").front();
+
+  param_.X = scope->FindVar(X)->GetMutable<lite::Tensor>();
+  param_.W = scope->FindVar(W)->GetMutable<lite::Tensor>();
+  param_.b = scope->FindVar(b)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(Out)->GetMutable<lite::Tensor>();
+  param_.out_size = op_desc.GetAttr<int>("out_size");
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(search_fc, paddle::lite::operators::SearchFcOpLite);
diff --git a/lite/operators/search_fc_op.h b/lite/operators/search_fc_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a871cadd33b4f7d4b6130a0b8ac2974a738ac0c3
--- /dev/null
+++ b/lite/operators/search_fc_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SearchFcOpLite : public OpLite {
+ public:
+  SearchFcOpLite() {}
+  explicit SearchFcOpLite(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "search_fc"; }
+
+ private:
+  mutable SearchFcParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b56ae820bf9de4ffe6aa3f6db7a8e1385c8cc11f
--- /dev/null
+++ b/lite/operators/search_grnn_op.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
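The search_aligned_mat_mul shape logic earlier in this patch treats every LoD segment as one GEMM of shape M x K times K x N, where the transpose flags select which side of each operand is contracted. A standalone sketch of that selection; names are illustrative, not from the patch.

// Standalone sketch of the M/N/K selection in SearchAlignedMatMulOpLite::
// InferShape: each sequence is a fixed-size block (x_batch x x_inner), and
// the transpose flags decide which dimension becomes the contraction axis.
#include <iostream>

void AlignedMatMulMNK(int x_batch, int x_inner, int y_batch, int y_inner,
                      bool trans_x, bool trans_y, int* M, int* N, int* K) {
  *M = trans_x ? x_inner : x_batch;
  *N = trans_y ? y_batch : y_inner;
  *K = trans_x ? x_batch : x_inner;  // must equal trans_y ? y_inner : y_batch
}

int main() {
  int M, N, K;
  AlignedMatMulMNK(/*x_batch=*/5, /*x_inner=*/8, /*y_batch=*/5, /*y_inner=*/16,
                   /*trans_x=*/true, /*trans_y=*/false, &M, &N, &K);
  std::cout << M << " " << N << " " << K << "\n";  // prints 8 16 5
}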
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_grnn_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGrnnOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.wi); + CHECK_OR_FALSE(param_.wh); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp_buffer); + CHECK_OR_FALSE(param_.idx_sorted_by_width); + CHECK_OR_FALSE(param_.layout_input); + + int _cap_h = param_.num_hidden; + int _cap_e = param_.num_input; + + const auto& x_dims = param_.x->dims(); + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(x_dims[1] == _cap_e); + + const auto& wi_dims = param_.wi->dims(); + CHECK_OR_FALSE(wi_dims.size() == 3); + CHECK_OR_FALSE(wi_dims[0] == 3); + CHECK_OR_FALSE(wi_dims[1] == _cap_h); + CHECK_OR_FALSE(wi_dims[2] == _cap_e); + + const auto& wh_dims = param_.wh->dims(); + CHECK_OR_FALSE(wh_dims.size() == 3); + CHECK_OR_FALSE(wh_dims[0] == 3); + CHECK_OR_FALSE(wh_dims[1] == _cap_h); + CHECK_OR_FALSE(wh_dims[2] == _cap_h); + + return true; +} + +bool SearchGrnnOpLite::InferShape() const { + const auto& x_dims = param_.x->dims(); + const auto& x_lod = param_.x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + CHECK_OR_FALSE(x_dims[0] == x_lod[0].back()); + param_.out->set_lod(x_lod); + + return true; +} + +bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto wi = op_desc.Input("Wi").front(); + auto wh = op_desc.Input("Wh").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.wi = scope->FindVar(wi)->GetMutable(); + param_.wh = scope->FindVar(wh)->GetMutable(); + + param_.num_input = op_desc.GetAttr("num_input"); + param_.num_hidden = op_desc.GetAttr("num_hidden"); + + auto out = op_desc.Output("Out").front(); + auto tmp_buffer = op_desc.Output("tmp_buffer").front(); + auto idx_sorted_by_width = op_desc.Output("idx_sorted_by_width").front(); + auto layout_input = op_desc.Output("layout_input").front(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp_buffer = scope->FindVar(tmp_buffer)->GetMutable(); + param_.idx_sorted_by_width = + scope->FindVar(idx_sorted_by_width)->GetMutable(); + param_.layout_input = + scope->FindVar(layout_input)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_grnn, paddle::lite::operators::SearchGrnnOpLite); diff --git a/lite/operators/search_grnn_op.h b/lite/operators/search_grnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..670af8a6c9ff9eafa33018a0303ea1a36b0a1e01 --- /dev/null +++ b/lite/operators/search_grnn_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGrnnOpLite : public OpLite { + public: + SearchGrnnOpLite() {} + + explicit SearchGrnnOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_grnn"; } + + private: + mutable SearchGrnnParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_group_padding_op.cc b/lite/operators/search_group_padding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ba4dde275f4b9662416bdf5190cacfafc56a40d --- /dev/null +++ b/lite/operators/search_group_padding_op.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/search_group_padding_op.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SearchGroupPaddingOp::CheckShape() const {
+  CHECK_EQ(param_.x->dims().size(), 2) << "The rank of X(Input) should be 2.";
+  CHECK_EQ(param_.x->lod().empty(), false)
+      << "Input Tensor of X does not contain LoD information.";
+  CHECK_GE(param_.x->lod()[0].size(), 2)
+      << "The Input(X)'s lod info is corrupted.";
+  CHECK_EQ(param_.x->dims()[0], static_cast<int64_t>(param_.x->lod()[0].back()))
+      << "The Input(X)'s lod info mismatches the actual tensor shape.";
+
+  return true;
+}
+
+bool SearchGroupPaddingOp::InferShape() const {
+  std::vector<int64_t> x_dims = param_.x->dims().Vectorize();
+
+  param_.out_emb_padding->Resize({-1, x_dims[1]});
+  param_.out_new->Resize({x_dims[0], 1});
+  param_.out_padding->Resize({-1, 1});
+  return true;
+}
+
+bool SearchGroupPaddingOp::AttachImpl(const cpp::OpDesc &op_desc,
+                                      lite::Scope *scope) {
+  auto x = op_desc.Input("X").front();
+  auto out_emb_padding = op_desc.Output("Out_emb_padding").front();
+  auto out_new = op_desc.Output("Out_new").front();
+  auto out_padding = op_desc.Output("Out_padding").front();
+
+  param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
+  param_.out_emb_padding =
+      scope->FindVar(out_emb_padding)->GetMutable<lite::Tensor>();
+  param_.out_new = scope->FindVar(out_new)->GetMutable<lite::Tensor>();
+  param_.out_padding = scope->FindVar(out_padding)->GetMutable<lite::Tensor>();
+  param_.pad_id = op_desc.GetAttr<int>("pad_id");
+
+  CHECK(param_.out_emb_padding)
+      << "Output(Out_emb_padding) of SearchGroupPadding Op should not be null.";
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(search_group_padding,
+                 paddle::lite::operators::SearchGroupPaddingOp);
diff --git a/lite/operators/search_group_padding_op.h b/lite/operators/search_group_padding_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8e96c9697b5f7de70349efa1f8b378a47c3823c
--- /dev/null
+++ b/lite/operators/search_group_padding_op.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGroupPaddingOp : public OpLite { + public: + SearchGroupPaddingOp() {} + explicit SearchGroupPaddingOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_group_padding"; } + + private: + mutable SearchGroupPaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_depadding_op.cc b/lite/operators/search_seq_depadding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..12d5123e05b41665550fb7e6b90a636093959263 --- /dev/null +++ b/lite/operators/search_seq_depadding_op.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_depadding_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqDepaddingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.pad); + CHECK_OR_FALSE(param_.src); + CHECK_OR_FALSE(param_.out); + + DDim pad_dims = param_.pad->dims(); + DDim src_dims = param_.src->dims(); + CHECK_OR_FALSE(pad_dims.size() == 2); + CHECK_OR_FALSE(src_dims.size() == 2); + + const auto& pad_lod = param_.pad->lod(); + CHECK_OR_FALSE(!pad_lod.empty()); + const auto& pad_lod_0 = pad_lod[0]; + CHECK_OR_FALSE(pad_lod_0.size() >= 2); + CHECK_OR_FALSE(pad_dims[0] == pad_lod_0.back()); + + const auto& src_lod = param_.src->lod(); + CHECK_OR_FALSE(!src_lod.empty()); + const auto& src_lod_0 = src_lod[0]; + CHECK_OR_FALSE(src_lod_0.size() >= 2); + CHECK_OR_FALSE(src_dims[0] == src_lod_0.back()); + return true; +} + +bool SearchSeqDepaddingOpLite::InferShape() const { + DDim pad_dims = param_.pad->dims(); + param_.out->Resize({-1, pad_dims[1]}); + return true; +} + +bool SearchSeqDepaddingOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto pad = op_desc.Input("Pad").front(); + auto src = op_desc.Input("Src").front(); + auto out = op_desc.Output("Out").front(); + + param_.pad = scope->FindVar(pad)->GetMutable(); + param_.src = scope->FindVar(src)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_depadding, + paddle::lite::operators::SearchSeqDepaddingOpLite); diff --git a/lite/operators/search_seq_depadding_op.h b/lite/operators/search_seq_depadding_op.h new file mode 100644 index 0000000000000000000000000000000000000000..445d9e0f3bcba6204243e80023d826bf53d90c60 --- /dev/null 
+++ b/lite/operators/search_seq_depadding_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqDepaddingOpLite : public OpLite { + public: + SearchSeqDepaddingOpLite() {} + + explicit SearchSeqDepaddingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_seq_depadding"; } + + private: + mutable SearchSeqDepaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_fc_op.cc b/lite/operators/search_seq_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5cca5331ab80479656b1212df02c20d463a3707 --- /dev/null +++ b/lite/operators/search_seq_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
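SearchSeqDepaddingOpLite::InferShape() above resizes Out to {-1, pad_dims[1]}, leaving the row count as a -1 placeholder because it only becomes known from the source LoD at run time; the kernel that fills it in is not shown in this patch, so the run-time step in the sketch below is an assumption made only for illustration.

// Sketch: "-1" as a compile-time placeholder for a batch dimension that is
// resolved once the source LoD is available.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  int64_t feature = 128;
  std::vector<int64_t> out_dims = {-1, feature};  // placeholder at InferShape time

  // At run time the source LoD is known, e.g. offsets {0, 3, 5}:
  std::vector<uint64_t> src_lod0 = {0, 3, 5};
  out_dims[0] = static_cast<int64_t>(src_lod0.back());  // 5 depadded rows

  std::cout << out_dims[0] << " x " << out_dims[1] << "\n";
  return 0;
}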
+ +#include "lite/operators/search_seq_fc_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool SearchSeqFcOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto w_dims = param_.w->dims(); + const auto& x_lod = param_.x->lod(); + auto out_size = param_.out_size; + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + + if (param_.b != nullptr) { + const auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + } + + param_.out->set_lod(x_lod); + param_.out->Resize({x_dims[0], w_dims[0]}); + return true; +} + +bool SearchSeqFcOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("W").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto out = op_desc.Output("Out").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + bool has_bias = op_desc.GetAttr("has_bias"); + if (has_bias) { + CHECK(!op_desc.Input("b").empty()); + auto b = op_desc.Input("b").front(); + param_.b = scope->FindVar(b)->GetMutable(); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_fc, paddle::lite::operators::SearchSeqFcOpLite); diff --git a/lite/operators/search_seq_fc_op.h b/lite/operators/search_seq_fc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3c4f7d82bfa66c2f323063f0297438c81ce18397 --- /dev/null +++ b/lite/operators/search_seq_fc_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqFcOpLite : public OpLite { + public: + SearchSeqFcOpLite() {} + + explicit SearchSeqFcOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_seq_fc"; } + + private: + mutable SearchSeqFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_softmax_op.cc b/lite/operators/search_seq_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..973ffa04c4562334af6d379b5446902036de8c5e --- /dev/null +++ b/lite/operators/search_seq_softmax_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_softmax_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqSoftmaxOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + return true; +} + +bool SearchSeqSoftmaxOp::InferShape() const { + param_.output->Resize(param_.x->dims()); + param_.output->set_lod(param_.x->lod()); + return true; +} + +bool SearchSeqSoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.x = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.axis = 1; + + CHECK(param_.x); + CHECK(param_.output); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_softmax, + paddle::lite::operators::SearchSeqSoftmaxOp); diff --git a/lite/operators/search_seq_softmax_op.h b/lite/operators/search_seq_softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f97e8ddd3a6c446fb5c53d5e603f43bbdf1e2525 --- /dev/null +++ b/lite/operators/search_seq_softmax_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
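SearchSeqSoftmaxOp above only fixes the output shape and LoD and hard-codes axis = 1; its kernel is not in this patch. For reference, the usual numerically stable row-wise softmax that axis = 1 implies, written as a standalone sketch rather than the actual Lite kernel.

// Row-wise softmax over a [rows, cols] buffer, in the standard stable form.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void RowSoftmax(const float* in, float* out, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    const float* x = in + r * cols;
    float* y = out + r * cols;
    float max_v = *std::max_element(x, x + cols);  // subtract max for stability
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      y[c] = std::exp(x[c] - max_v);
      sum += y[c];
    }
    for (int c = 0; c < cols; ++c) y[c] /= sum;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 0.f, 0.f, 0.f};
  std::vector<float> y(x.size());
  RowSoftmax(x.data(), y.data(), 2, 3);
  std::printf("%.3f %.3f %.3f\n", y[0], y[1], y[2]);
  return 0;
}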
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqSoftmaxOp : public OpLite { + public: + SearchSeqSoftmaxOp() {} + explicit SearchSeqSoftmaxOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_seq_softmax_op"; } + + private: + mutable SoftmaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_arithmetic_op.cc b/lite/operators/sequence_arithmetic_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..29c39ebc23f54c2c3c052e322575d97570195cfc --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_arithmetic_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceArithmeticOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_EQ(param_.X->dims().size(), 2) << "Input X should a 2-D Tensor"; + CHECK_EQ(param_.Y->dims().size(), 2) << "Input Y should a 2-D Tensor"; + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool SequenceArithmeticOp::InferShape() const { + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool SequenceArithmeticOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = scope->FindTensor(opdesc.Input("X").front()); + param_.Y = scope->FindTensor(opdesc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front()); + + param_.op_type = opdesc.GetAttr("op_type"); + + CHECK(param_.X); + CHECK(param_.Y); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); +REGISTER_LITE_OP(search_seq_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); diff --git a/lite/operators/sequence_arithmetic_op.h b/lite/operators/sequence_arithmetic_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f844dfbf429599d829bc786c66ba6d05e40d79d --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceArithmeticOp : public OpLite { + public: + SequenceArithmeticOp() {} + explicit SequenceArithmeticOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "sequence_arithmetic"; } + + private: + mutable SequenceArithmeticParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_concat_op.cc b/lite/operators/sequence_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a54df890cc6b90910713ed7d6d44f9218e72e28 --- /dev/null +++ b/lite/operators/sequence_concat_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_concat_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceConcatOp::CheckShape() const { + CHECK_GT(param_.X.size(), 1) + << "The number of input sequences is at least two."; + CHECK_OR_FALSE(param_.Out); + size_t lod_size = 0; + for (const auto &t : param_.X) { + CHECK_EQ(t->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + // CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now."; + if (lod_size == 0) { + lod_size = t->lod()[0].size(); + } else { + CHECK_EQ(t->lod()[0].size(), lod_size) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + return true; +} + +bool SequenceConcatOp::InferShape() const { + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto &tensor : param_.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. 
+ } + out_dims[0] = batch_size; + param_.Out->Resize(out_dims); + // LoD info will be computed in Kernel. + return true; +} + +bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + auto input_list = opdesc.Input("X"); + param_.X.clear(); + for (auto var : input_list) { + param_.X.push_back(scope->FindVar(var)->GetMutable()); + } + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + CHECK(param_.Out) << "Output(Out) of Sequence Concat Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_concat, paddle::lite::operators::SequenceConcatOp); diff --git a/lite/operators/sequence_concat_op.h b/lite/operators/sequence_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8cdc07ebca83b9c400b00a0f40556a788c5854e6 --- /dev/null +++ b/lite/operators/sequence_concat_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceConcatOp : public OpLite { + public: + SequenceConcatOp() {} + explicit SequenceConcatOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_concat"; } + + private: + mutable SequenceConcatParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd8fa2e8fd5816cc92355c9c73caf1aa76baf36c --- /dev/null +++ b/lite/operators/sequence_reverse_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
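SequenceConcatOp::InferShape() above concatenates along axis 0: the output batch is the sum of the inputs' first dimensions, every input must share the same per-row feature size, and a negative batch is normalized to -1 for compile time. The same bookkeeping as a standalone sketch; not part of the patch.

// Standalone version of the dimension bookkeeping in
// SequenceConcatOp::InferShape(): concat along axis 0, all inputs must have
// the same per-row feature size.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ConcatAxis0Shape(
    const std::vector<std::vector<int64_t>>& in_dims) {
  int64_t batch = 0;
  int64_t feature = 0;
  std::vector<int64_t> out = in_dims.front();
  for (const auto& d : in_dims) {
    int64_t feat = 1;
    for (size_t i = 1; i < d.size(); ++i) feat *= d[i];  // production() / d[0]
    if (feature == 0) feature = feat;
    assert(feature == feat);  // the "same feature size" check
    batch += d[0];
  }
  if (batch < 0) batch = -1;  // normalize unknown batch for compile time
  out[0] = batch;
  return out;
}

int main() {
  auto out = ConcatAxis0Shape({{3, 8}, {2, 8}});
  assert(out[0] == 5 && out[1] == 8);
  return 0;
}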
+ +#include "lite/operators/sequence_reverse_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceReverseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + CHECK_EQ(param_.X->lod().empty(), false) + << "Input(X) Tensor of SequenceReverseOp does not contain " + "LoD information."; + CHECK_GE(param_.X->dims().size(), 2) + << "Rank of Input(X) must be not less than 2."; + return true; +} + +bool SequenceReverseOp::InferShape() const { + const auto *input = param_.X; + auto out_dims = input->dims(); + param_.Out->Resize(out_dims); + return true; +} + +bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + CHECK(param_.X); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_reverse, paddle::lite::operators::SequenceReverseOp); diff --git a/lite/operators/sequence_reverse_op.h b/lite/operators/sequence_reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..326d0f68927199e9353a5bbe8c072d342c9e3d69 --- /dev/null +++ b/lite/operators/sequence_reverse_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceReverseOp : public OpLite { + public: + SequenceReverseOp() {} + explicit SequenceReverseOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_reverse"; } + + private: + mutable SequenceReverseParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_topk_avg_pooling_op.cc b/lite/operators/sequence_topk_avg_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f5cbeeeee5816132d2ebcb7094949189931b931 --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_topk_avg_pooling_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceTopkAvgPoolingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.ROW); + CHECK_OR_FALSE(param_.COLUMN); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pos); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::InferShape() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.ROW->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.Out->Resize(lite::DDim(vec_out_shape)); + param_.Out->set_lod(param_.ROW->lod()); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto ROW = op_desc.Input("ROW").front(); + auto COLUMN = op_desc.Input("COLUMN").front(); + auto Out = op_desc.Output("Out").front(); + auto pos = op_desc.Output("pos").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.ROW = scope->FindVar(ROW)->GetMutable(); + param_.COLUMN = scope->FindVar(COLUMN)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.pos = scope->FindVar(pos)->GetMutable(); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.topks = op_desc.GetAttr>("topks"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_topk_avg_pooling, + paddle::lite::operators::SequenceTopkAvgPoolingOpLite); diff --git a/lite/operators/sequence_topk_avg_pooling_op.h b/lite/operators/sequence_topk_avg_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c1cfe3a9c7bc82c3e79fc372b98293183509dca --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceTopkAvgPoolingOpLite : public OpLite { + public: + SequenceTopkAvgPoolingOpLite() {} + explicit SequenceTopkAvgPoolingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { + return "sequence_topk_avg_pooling"; + } + + private: + mutable SequenceTopkAvgPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc index 18280616aa00b734596b620727f6dcfd5beb67d7..ec98a0d6c3ba3b1e5cd1c7992b58e96917d21057 100644 --- a/lite/operators/split_op.cc +++ b/lite/operators/split_op.cc @@ -39,8 +39,16 @@ bool SplitOp::InferShape() const { const int outs_number = outs.size(); std::vector outs_dims; outs_dims.reserve(outs_number); - - if (num > 0) { + std::vector sections_tensor_list_ = + param_.sections_tensor_list; + if (sections.size() > 0 && sections_tensor_list_.size() > 0) { + std::vector vec_sections; + for (size_t i = 0; i < sections_tensor_list_.size(); ++i) { + auto dim = in_dims; + dim[axis] = sections_tensor_list_[i]->data()[0]; + outs_dims.push_back(dim); + } + } else if (num > 0) { int out_axis_dim = in_dims[axis] / num; for (int i = 0; i < outs_number; ++i) { auto dim = in_dims; @@ -55,6 +63,10 @@ bool SplitOp::InferShape() const { } } + if (param_.axis_tensor != nullptr) { + axis = param_.axis_tensor->data()[0]; + } + for (int j = 0; j < outs_dims.size(); ++j) { outs[j]->Resize(outs_dims[j]); } @@ -73,6 +85,21 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { for (auto var : outs) { param_.output.push_back(scope->FindVar(var)->GetMutable()); } + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") != + input_arg_names.end()) { + auto args = opdesc.Input("AxisTensor"); + auto *var = scope->FindVar(args.front()); + param_.axis_tensor = var->GetMutable(); + } + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "SectionsTensorList") != input_arg_names.end()) { + auto args = opdesc.Input("SectionsTensorList"); + auto *var = scope->FindVar(args.front()); + param_.sections_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/unsqueeze_op.cc b/lite/operators/unsqueeze_op.cc index 8db14d0660a7b48b94406e35908f0636a53d57f6..39b275b7b55f79f2c8daf16ab0a6acd2e76e8b48 100644 --- a/lite/operators/unsqueeze_op.cc +++ b/lite/operators/unsqueeze_op.cc @@ -66,10 +66,7 @@ bool UnsqueezeOp::InferShape() const { std::vector final_axes; auto axes = param_.axes; auto *axes_tensor = param_.axes_tensor; - std::vector axes_tensor_vct; - if 
(param_.axes_tensor_vct) { - axes_tensor_vct = *(param_.axes_tensor_vct); - } + auto axes_tensor_vct = param_.axes_tensor_vct; if (!axes.empty()) { final_axes = axes; @@ -79,7 +76,7 @@ bool UnsqueezeOp::InferShape() const { axes_tensor_data + axes_tensor->numel()); } else if (!axes_tensor_vct.empty()) { for (int i = 0; i < axes_tensor_vct.size(); i++) { - final_axes.push_back(axes_tensor_vct[i].data()[0]); + final_axes.push_back(axes_tensor_vct[i]->data()[0]); } } else { LOG(FATAL) << "Input axis error"; @@ -114,16 +111,12 @@ bool UnsqueezeOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasInput("AxesTensorList") && opdesc.Input("AxesTensorList").size() > 0) { auto args = opdesc.Input("AxesTensorList"); - /* for (auto arg : args) { auto *var = scope->FindVar(arg); if (var != nullptr) { param_.axes_tensor_vct.push_back(var->GetMutable()); } } - */ - auto *var = scope->FindVar(args.front()); - param_.axes_tensor_vct = var->GetMutable>(); } CHECK(param_.X) << "Input(X) of UnsqueezeOp should not be null."; CHECK(param_.Out) << "Output(Out) of UnsqueezeOp should not be null."; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c7fe374fc90b20ee44df3d1619f44109b7387c0 --- /dev/null +++ b/lite/operators/var_conv_2d_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
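The unsqueeze_op change above goes back to reading AxesTensorList as a list of scalar tensors, taking axes_tensor_vct[i]->data()[0] from each entry, with the axes attribute and AxesTensor taking precedence. A standalone sketch of that resolution order, with plain vectors standing in for the tensors (an assumption made only for illustration).

// Sketch of the axis-resolution order used by UnsqueezeOp::InferShape():
// 1) the "axes" attribute, 2) a packed AxesTensor, 3) AxesTensorList, where
// each list entry is a scalar tensor and only its first element is read.
// The real op reports "Input axis error" if all three are empty.
#include <cassert>
#include <vector>

std::vector<int> ResolveAxes(const std::vector<int>& attr_axes,
                             const std::vector<int>& axes_tensor,
                             const std::vector<std::vector<int>>& axes_list) {
  if (!attr_axes.empty()) return attr_axes;
  if (!axes_tensor.empty()) return axes_tensor;
  std::vector<int> out;
  for (const auto& t : axes_list) out.push_back(t[0]);  // t->data()[0]
  return out;
}

int main() {
  auto axes = ResolveAxes({}, {}, {{1}, {3}});
  assert(axes.size() == 2 && axes[0] == 1 && axes[1] == 3);
  return 0;
}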
+ +#include "lite/operators/var_conv_2d_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool VarConv2dOp::CheckShape() const { + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor"; + CHECK_EQ(w_dims[0], param_.output_channel) + << "W dim[0] should be equal to OutputChannel"; + CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w) + << "W dim[1] should be equal to InputChannel * KernelH * KernelW"; + LoD x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info."; + // CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + // LoD row_lod = param_.ROW->lod(); + // CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info."; + // LoD col_lod = param_.COLUMN->lod(); + // CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod + // info."; + return true; +} + +bool VarConv2dOp::InferShape() const { return true; } + +bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + // param_.ROW = const_cast( + // &scope->FindVar(opdesc.Input("ROW").front())->Get()); + // param_.COLUMN = const_cast( + // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + param_.W = const_cast( + &scope->FindVar(opdesc.Input("W").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.Col = + scope->FindVar(opdesc.Output("Col").front())->GetMutable(); + CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; + // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + CHECK(param_.W) << "W(Input) of VarConv2dOP should not be null."; + CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; + CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; + param_.output_channel = opdesc.GetAttr("OutputChannel"); + param_.input_channel = opdesc.GetAttr("InputChannel"); + param_.kernel_h = opdesc.GetAttr("KernelH"); + param_.kernel_w = opdesc.GetAttr("KernelW"); + param_.stride_h = opdesc.GetAttr("StrideH"); + param_.stride_w = opdesc.GetAttr("StrideW"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(var_conv_2d, paddle::lite::operators::VarConv2dOp); diff --git a/lite/operators/var_conv_2d_op.h b/lite/operators/var_conv_2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ce6309419cc582c2f93250dd6e8e59c04a951f91 --- /dev/null +++ b/lite/operators/var_conv_2d_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class VarConv2dOp : public OpLite { + public: + VarConv2dOp() {} + explicit VarConv2dOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "var_conv_2d"; } + + private: + mutable VarConv2DParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index 7c0f867fae4bca1957ba1610db5f40b8c8dbabdf..eefd30f74f570f64d1b5617c9dddc836086394b1 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -17,8 +17,8 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" -#include "lite/tests/utils/timer.h" #include "lite/utils/cv/paddle_image_preprocess.h" DEFINE_int32(cluster, 3, "cluster id"); @@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::Tensor Tensor_api; typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; void fill_tensor_host_rand(uint8_t* dio, int64_t size) { uint seed = 256; @@ -285,8 +285,8 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); + t1.Reset(); + t1.Start(); LOG(INFO) << "image convert saber compute"; // 方法一: image_preprocess.imageCovert(src, lite_dst); @@ -329,8 +329,8 @@ void test_img(const std::vector& cluster_id, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); + t1.Stop(); + double tdiff = t1.LapTimes().Avg(); to += tdiff; if (tdiff < min_time) { min_time = tdiff; diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 02d40ce6cc4acfd582fb148f10aafc654ee13be0..549fabab5a20b7757585eacdc2fe4db64e0aaadf 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -39,6 +39,8 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
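image_convert_test.cc above switches from the old test timer (clear/start/end/get_average_ms) to lite::profile::Timer (Reset/Start/Stop/LapTimes().Avg()). The measurement pattern itself is unchanged; the standalone std::chrono sketch below mirrors it with a hypothetical LapTimer type, so it can be compiled outside the Lite tree, and is not the actual profile::Timer implementation.

// The same reset / start / stop / average-lap pattern as the profile::Timer
// calls in the test, written with std::chrono so it runs standalone.
#include <chrono>
#include <cstdio>
#include <vector>

struct LapTimer {  // hypothetical stand-in, not lite::profile::Timer
  std::vector<double> laps_ms;
  std::chrono::steady_clock::time_point t0;
  void Reset() { laps_ms.clear(); }
  void Start() { t0 = std::chrono::steady_clock::now(); }
  void Stop() {
    auto dt = std::chrono::steady_clock::now() - t0;
    laps_ms.push_back(std::chrono::duration<double, std::milli>(dt).count());
  }
  double Avg() const {
    double s = 0;
    for (double v : laps_ms) s += v;
    return laps_ms.empty() ? 0 : s / laps_ms.size();
  }
};

int main() {
  LapTimer t;
  t.Reset();
  for (int i = 0; i < 10; ++i) {
    t.Start();
    volatile double acc = 0;
    for (int j = 0; j < 100000; ++j) acc += j * 0.5;  // stand-in workload
    t.Stop();
  }
  std::printf("avg lap: %.3f ms\n", t.Avg());
  return 0;
}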
lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/bilinear_interp_compute_test.cc b/lite/tests/kernels/bilinear_interp_compute_test.cc index 0779caf67aac907e6f8ccde8b3e65d413cf65db9..7ea4293f080df31d9bb05b4998b5b2d9ae7d5a47 100644 --- a/lite/tests/kernels/bilinear_interp_compute_test.cc +++ b/lite/tests/kernels/bilinear_interp_compute_test.cc @@ -22,6 +22,27 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*(tensor->data()))); + } + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_bilinear_align(std::vector inputs, lite::Tensor* output) { @@ -149,6 +170,9 @@ class BilinearInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -162,6 +186,8 @@ class BilinearInterpComputeTester : public arena::TestCase { std::string interp_method_ = "Bilinear"; DDim _dims0_{{1, 1, 16, 16}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: BilinearInterpComputeTester(const Place& place, @@ -190,33 +216,48 @@ class BilinearInterpComputeTester : public arena::TestCase { if (outsize_height_ > 0 && outsize_width_ > 0) { inputs.emplace_back(scope->FindTensor(input1_)); } + std::vector SizeTensor; + if (outsize_height_ > 0 && outsize_width_ > 0) { + SizeTensor.emplace_back(scope->FindTensor(sizetensor0_)); + SizeTensor.emplace_back(scope->FindTensor(sizetensor1_)); + } + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + if (inputs.size() > 1) { + auto out_size = inputs[1]; + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / 
inputs[0]->dims()[3]); } auto* outputs = scope->NewTensor(output_); CHECK(outputs); - if (inputs.size() > 1) { - auto outsize_data = inputs[1]->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = inputs[0]->dims()[0]; - int c_cout = inputs[0]->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } else { - int out_h; - int out_w; - if (-1 == out_height_ && -1 == out_width_) { - out_h = inputs[0]->dims()[2] * height_scale_; - out_w = inputs[0]->dims()[3] * width_scale_; - } else { - out_h = out_height_; - out_w = out_width_; - } - outputs->Resize( - {inputs[0]->dims()[0], inputs[0]->dims()[1], out_h, out_w}); - } - + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); if (align_corners_) { resize_bilinear_align(inputs, outputs); } else { @@ -229,6 +270,10 @@ class BilinearInterpComputeTester : public arena::TestCase { op_desc->SetInput("X", {input0_}); if (outsize_height_ > 0 && outsize_width_ > 0) { op_desc->SetInput("OutSize", {input1_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + } + if (height_scale_ > 0) { + op_desc->SetInput("Scale", {input_scale_}); } op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -250,6 +295,19 @@ class BilinearInterpComputeTester : public arena::TestCase { data1[0] = outsize_height_; data1[1] = outsize_width_; SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = outsize_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = outsize_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + } + + if (height_scale_ > 0) { + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } } }; diff --git a/lite/tests/kernels/conv2d_transpose_compute_test.cc b/lite/tests/kernels/conv2d_transpose_compute_test.cc index a287f0bb6610921e0f048fcc4d46f8729dd177c1..6c348076ba82490a599b9916826c59dabf91f870 100644 --- a/lite/tests/kernels/conv2d_transpose_compute_test.cc +++ b/lite/tests/kernels/conv2d_transpose_compute_test.cc @@ -31,8 +31,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -40,19 +42,22 @@ void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int 
input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; @@ -104,6 +109,34 @@ void fill_bias_relu(float* tensor, } } +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + template static void basic_gemm(int m, int n, @@ -172,8 +205,10 @@ bool deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -193,8 +228,9 @@ bool deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); for (int i = 0; i < num; ++i) { const Dtype1* din_batch = din + i * chin * hin * win; @@ -204,7 +240,7 @@ bool deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -230,8 +266,10 @@ bool deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, @@ -253,9 +291,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { std::string output_ = "out"; std::string filter_ = "filter"; std::string bias_ = "bias"; + std::string padding_algorithm_ = ""; std::vector strides_{1, 1}; - std::vector paddings_{0, 0}; + std::vector paddings_{0, 0, 0, 0}; int groups_{1}; std::vector dilations_{1, 1}; bool flag_relu_{false}; @@ -280,9 +319,13 @@ class Conv2DTransposeComputeTester : public arena::TestCase { bool flag_relu, int dilation, int stride, - int padding, + int pad_h0, + int pad_h1, + int pad_w0, + int pad_w1, int ks, - int groups) + int groups, + std::string padding_algorithm) : TestCase(place, alias) { n_ = n; ic_ = ic; @@ -291,20 +334,29 @@ class Conv2DTransposeComputeTester : public arena::TestCase { iw_ = iw; ks_ = ks; flag_bias_ = flag_bias; - + padding_algorithm_ = padding_algorithm; strides_ = std::vector({stride, stride}); - paddings_ = std::vector({padding, padding}); - groups_ = groups; + paddings_ = std::vector({pad_h0, pad_h1, pad_w0, 
pad_w1}); dilations_ = std::vector({dilation, dilation}); + groups_ = groups; flag_relu_ = flag_relu; } void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); - int oh = (ih_ - 1) * strides_[0] - 2 * paddings_[0] + + auto* x = scope->FindTensor(x_); + auto input_dim = x->dims(); + std::vector ksize({1, 1, ks_, ks_}); + UpdatePaddingAndDilation(&paddings_, + &dilations_, + strides_, + padding_algorithm_, + input_dim, + ksize); + int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] + dilations_[0] * (ks_ - 1) + 1; - int ow = (iw_ - 1) * strides_[1] - 2 * paddings_[1] + + int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] + dilations_[1] * (ks_ - 1) + 1; CHECK(oh > 0 || ow > 0); @@ -313,7 +365,6 @@ class Conv2DTransposeComputeTester : public arena::TestCase { out->Resize(output_dims); auto* output_data = out->mutable_data(); - auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); auto* filter = scope->FindTensor(filter_); const auto* filter_data = filter->data(); @@ -341,8 +392,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { strides_[0], dilations_[1], dilations_[0], - paddings_[1], + paddings_[2], + paddings_[3], paddings_[0], + paddings_[1], flag_bias_, flag_relu_); } @@ -360,6 +413,7 @@ class Conv2DTransposeComputeTester : public arena::TestCase { op_desc->SetInput("Bias", {bias_}); } op_desc->SetAttr("fuse_relu", flag_relu_); + op_desc->SetAttr("padding_algorithm", padding_algorithm_); } void PrepareData() override { @@ -402,49 +456,66 @@ TEST(conv2d_transpose, precision) { LOG(INFO) << "test conv2d_transpose op"; #ifdef LITE_WITH_ARM Place place(TARGET(kARM)); - for (auto n : {1, 2}) { + for (auto n : {2}) { for (auto ic : {1, 4 /*, 128*/}) { for (auto oc : {1, 4 /*, 128*/}) { LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc; - for (auto ih : {8, 16 /*, 56 , 112, 224, 512*/}) { + for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) { for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) { for (auto flag_bias : {false, true}) { for (auto flag_relu : {false, true}) { for (auto dilation : {1, 2}) { for (auto stride : {1, 2}) { - for (auto padding : {0, 2}) { - for (auto ks : {2, 5}) { - for (auto group : {1, 2}) { - // obtain shape - // LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << - // oc - // << ",ih:" << ih << ",iw:" << iw - // << ",flag_bias:" << flag_bias - // << ",flag_relu:" << flag_relu - // << ",dila:" << dilation - // << ",stride:" << stride - // << ",padding:" << padding << ",ks:" << ks - // << ",group:" << group; - if (ic % group != 0 || oc % group != 0) { - group = 1; + for (auto pad_h0 : {0, 1}) { + for (auto pad_h1 : {0, 1}) { + for (auto pad_w0 : {0, 1}) { + for (auto pad_w1 : {0, 1}) { + for (auto ks : {1, 4}) { + for (auto group : {1, 2}) { + for (auto padding_algorithm : + {"", "SAME", "VALID"}) { + // obtain shape + // LOG(INFO) << "n:" << n << ",ic:" << ic << + // ",oc:" << + // oc + // << ",ih:" << ih << ",iw:" << iw + // << ",flag_bias:" << flag_bias + // << ",flag_relu:" << flag_relu + // << ",dila:" << dilation + // << ",stride:" << stride + // << ",padding:" << padding << + // ",ks:" << ks + // << ",group:" << group; + if (ic % group != 0 || oc % group != 0) { + group = 1; + } + std::unique_ptr tester( + new Conv2DTransposeComputeTester( + place, + "def", + n, + ic, + oc, + ih, + iw, + flag_bias, + flag_relu, + dilation, + stride, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + ks, + group, + padding_algorithm)); + arena::Arena arena( + std::move(tester), place, 2e-5); + 
arena.TestPrecision(); + } + } + } } - std::unique_ptr tester( - new Conv2DTransposeComputeTester(place, - "def", - n, - ic, - oc, - ih, - iw, - flag_bias, - flag_relu, - dilation, - stride, - padding, - ks, - group)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); } } } diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e211582b04d279b535f0d3873a9b0c537e375a60 --- /dev/null +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class FillConstantComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string out_ = "out"; + int dtype_{static_cast(VarDescAPI::VarDataType::FP32)}; + std::vector shape_{}; + std::string shape_tensor_ = "ShapeTensor"; + std::vector shape_tensor_list_; + bool is_use_shape_tensor_{false}; + bool is_use_shape_tensor_list_{false}; + + float value_{0.0f}; + // useless for x86, keep it for compatibility + bool force_cpu_{false}; + // DDim shape_tensor_data{{5, 3}}; + std::vector shape_tensor_data; + DDim shape_test{{1, 2}}; + + public: + FillConstantComputeTester(const Place& place, + const std::string& alias, + std::vector shape, + const bool is_use_shape_tensor, + const bool is_use_shape_tensor_list, + float value, + bool force_cpu) + : TestCase(place, alias) { + shape_ = shape; + value_ = value; + force_cpu_ = force_cpu; + is_use_shape_tensor_ = is_use_shape_tensor; + is_use_shape_tensor_list_ = is_use_shape_tensor_list; + + for (int i = 0; i < shape_test.size(); i++) { + shape_tensor_data.push_back(i + 1); + } + } + + void RunBaseline(Scope* scope) override { + auto* out = scope->NewTensor(out_); + DDim output_dims{shape_}; + if (is_use_shape_tensor_) { + auto* temp_shape = scope->FindTensor(shape_tensor_); + auto* shape_data = temp_shape->data(); + auto vec_shape = + std::vector(shape_data, shape_data + temp_shape->numel()); + output_dims.ConstructFrom(vec_shape); + } + if (is_use_shape_tensor_list_) { + std::vector vec_shape; + for (int i = 0; i < shape_tensor_list_.size(); i++) { + auto* temp_shape = scope->FindTensor(shape_tensor_list_[i]); + vec_shape.push_back(*temp_shape->data()); + } + + output_dims.ConstructFrom(vec_shape); + } + out->Resize(output_dims); + + auto* output_data = out->mutable_data(); + for (int i = 0; i < out->numel(); i++) { + output_data[i] = value_; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + LOG(INFO) << "PrepareOpDesc"; + + op_desc->SetType("fill_constant"); + op_desc->SetAttr("dtype", dtype_); + op_desc->SetAttr("shape", shape_); + op_desc->SetAttr("value", value_); + op_desc->SetAttr("force_cpu", force_cpu_); 
+ if (is_use_shape_tensor_) { + op_desc->SetInput("ShapeTensor", {shape_tensor_}); + } + if (is_use_shape_tensor_list_) { + // std::vector shape_tensor_list_; + for (int i = 0; i < shape_test.size(); ++i) { + shape_tensor_list_.push_back("shape_tensor_list_" + std::to_string(i)); + } + op_desc->SetInput("ShapeTensorList", {shape_tensor_list_}); + } + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + if (is_use_shape_tensor_) { + // std::vector temp = x_dims_.data(); + // int64_t* data = temp.data(); + SetCommonTensor(shape_tensor_, shape_test, shape_tensor_data.data()); + } + if (is_use_shape_tensor_list_) { + Scope& scope_ = this->scope(); + for (int i = 0; i < shape_test.size(); ++i) { + auto* tensor = + scope_.NewTensor("shape_tensor_list_" + std::to_string(i)); + tensor->Resize(DDim({1})); + auto* d = tensor->mutable_data(); + d[0] = shape_tensor_data[i]; + } + } + } +}; + +TEST(fill_constant, precision) { + LOG(INFO) << "test fill_constant op, kARM"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + std::vector shape{1, 2}; + + for (int dtype : {static_cast(VarDescAPI::VarDataType::INT32)}) { + for (float value : {1, 2}) { + for (bool is_use_shape_tensor_list : {false, true}) { + for (bool is_use_shape_tensor : {false, true}) { + if (is_use_shape_tensor && is_use_shape_tensor_list) break; + LOG(INFO) << "value:" << value + << ", is_use_shape_tensor:" << is_use_shape_tensor + << ", is_use_shape_tensor_list:" + << is_use_shape_tensor_list; + + std::unique_ptr tester( + new FillConstantComputeTester(place, + "def", + shape, + is_use_shape_tensor, + is_use_shape_tensor_list, + value, + false)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } +#endif + +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + LOG(INFO) << "test concate op, x86"; + for (int axis : {1, 2}) { + for (bool is_use_axis_tensor : {false, true}) { + LOG(INFO) << "axis:" << axis + << ", is_use_axis_tensor:" << is_use_axis_tensor; + std::unique_ptr tester( + new ConcateComputeTester(place, "def", axis, is_use_axis_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/lrn_compute_test.cc b/lite/tests/kernels/lrn_compute_test.cc index 9ee43c5c60b4703f64e7a2575ec15ba59b618052..e306155514e7423dfcfccb3d7103050b50f9fdbe 100644 --- a/lite/tests/kernels/lrn_compute_test.cc +++ b/lite/tests/kernels/lrn_compute_test.cc @@ -158,7 +158,7 @@ class LrnComputeTester : public arena::TestCase { op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("alpha", alpha_); op_desc->SetAttr("beta", beta_); - op_desc->SetAttr("local_size", local_size_); + op_desc->SetAttr("n", local_size_); op_desc->SetAttr("k", k_); op_desc->SetAttr("norm_region", norm_region_); } diff --git a/lite/tests/kernels/nearest_interp_compute_test.cc b/lite/tests/kernels/nearest_interp_compute_test.cc index 3256ababcab639cd31ef51294a890b7fbdb54d5d..894959f9090cce8a391c146815f550d5f42adcb6 100644 --- a/lite/tests/kernels/nearest_interp_compute_test.cc +++ b/lite/tests/kernels/nearest_interp_compute_test.cc @@ -22,6 +22,28 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return 
vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_nearest_align(std::vector inputs, lite::Tensor* output, @@ -73,6 +95,9 @@ class NearestInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -85,6 +110,8 @@ class NearestInterpComputeTester : public arena::TestCase { DDim dims_{{2, 3}}; DDim _dims0_{{2, 3, 3, 2}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: NearestInterpComputeTester(const Place& place, @@ -112,24 +139,54 @@ class NearestInterpComputeTester : public arena::TestCase { inputs.emplace_back(scope->FindTensor(input0_)); inputs.emplace_back(scope->FindTensor(input1_)); - auto outsize_data = inputs[1]->data(); + std::vector SizeTensor(2); + SizeTensor[0] = scope->FindTensor(sizetensor0_); + SizeTensor[1] = scope->FindTensor(sizetensor1_); + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + auto out_size = inputs[1]; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } - if (inputs.size() > 1) { - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = outputs->dims()[0]; - int c_cout = outputs->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); + resize_nearest_align(inputs, outputs, align_corners_); } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("nearest_interp"); op_desc->SetInput("X", {input0_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + op_desc->SetInput("Scale", {input_scale_}); op_desc->SetInput("OutSize", {input1_}); op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -152,6 +209,17 @@ class NearestInterpComputeTester : public arena::TestCase { SetCommonTensor(input0_, _dims0_, data0.data()); SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = out_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = out_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, 
sizetensor_data.data()); + + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } }; diff --git a/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb824931ae9ae1e8472dd3f368a04c24e72aa291 --- /dev/null +++ b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchAlignedMatMulComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "X"; + std::string y_ = "Y"; + bool x_transpose_; + bool y_transpose_; + float alpha_; + std::string out_ = "Out"; + DDim x_dims_; + DDim y_dims_; + LoD x_lod_; + LoD y_lod_; + + public: + SearchAlignedMatMulComputeTester(const Place& place, + const std::string& alias, + bool x_transpose, + bool y_transpose, + float alpha, + const DDim& x_dims, + const DDim& y_dims, + const LoD& x_lod, + const LoD& y_lod) + : TestCase(place, alias), + x_transpose_(x_transpose), + y_transpose_(y_transpose), + alpha_(alpha), + x_dims_(x_dims), + y_dims_(y_dims), + x_lod_(x_lod), + y_lod_(y_lod) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto y = scope->FindTensor(y_); + CHECK(x); + CHECK(y); + const auto x_data = x->data(); + const auto y_data = y->data(); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose_ ? x_inner_size : x_batch_size; + int N = y_transpose_ ? y_batch_size : y_inner_size; + int X_K = x_transpose_ ? x_batch_size : x_inner_size; + int Y_K = y_transpose_ ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + int lda = x_transpose_ ? M : K; + int ldb = y_transpose_ ? 
K : N; + int ldc = N; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + out->set_lod(out_lod); + out->Resize(out_dims); + + auto out_data = out->mutable_data(); + for (int i = 0; i < seq_num; i++) { + basic_gemm(x_transpose_, + y_transpose_, + M, + N, + K, + alpha_, + x_data + i * x_stride, + lda, + y_data + i * y_stride, + ldb, + 0, + out_data + i * out_stride, + ldc, + nullptr, + false, + false); + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_aligned_mat_mul"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("Y", {y_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("transpose_X", x_transpose_); + op_desc->SetAttr("transpose_Y", y_transpose_); + op_desc->SetAttr("alpha", alpha_); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector y_data(y_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y_data.data(), -1.f, 1.f, y_dims_.production()); + SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(y_, y_dims_, y_data.data(), y_lod_); + } +}; + +void test_search_aligned_mat_mul(Place place) { + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + } + LoD x_lod; + LoD y_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + + std::unique_ptr tester( + new SearchAlignedMatMulComputeTester(place, + "def", + x_transpose, + y_transpose, + alpha, + x_dims, + y_dims, + x_lod, + y_lod)); + arena::Arena arena(std::move(tester), place, 5e-4); + arena.TestPrecision(); + } + } + } + } + } + } + } +} + +TEST(SearchAlignedMatMul, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_aligned_mat_mul(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/search_seq_fc_compute_test.cc b/lite/tests/kernels/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..988d3a27cc238a57ae18de81a2dad619f8b4a9f0 --- /dev/null +++ b/lite/tests/kernels/search_seq_fc_compute_test.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchSeqFcOPTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "x"; + std::string w_ = "w"; + std::string b_ = "b"; + std::string out_ = "out"; + DDim x_dims_; + DDim w_dims_; + DDim b_dims_; + LoD x_lod_; + bool has_bias_; + int out_size_; + + public: + SearchSeqFcOPTest(const Place& place, + const std::string& alias, + DDim x_dims, + DDim w_dims, + DDim b_dims, + LoD x_lod, + bool has_bias, + int out_size) + : TestCase(place, alias), + x_dims_(x_dims), + w_dims_(w_dims), + b_dims_(b_dims), + x_lod_(x_lod), + has_bias_(has_bias), + out_size_(out_size) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto w = scope->FindTensor(w_); + CHECK(x); + CHECK(w); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_data = x->data(); + const auto w_data = w->data(); + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size_) << "Wrong shape: w_dims[0] != out_size"; + + const float* b_data = nullptr; + if (has_bias_) { + auto b = scope->FindTensor(b_); + CHECK(b); + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + b_data = b->data(); + } + + out->set_lod(x_lod); + out->Resize({x_dims[0], w_dims[0]}); + + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto out_data = out->mutable_data(); + basic_gemm(false, + true, + M, + N, + K, + 1.f, + x_data, + K, + w_data, + K, + 0, + out_data, + N, + nullptr, + false, + false); + if (b_data != nullptr) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_seq_fc"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("W", {w_}); + if (has_bias_) { + op_desc->SetInput("b", {b_}); + } + op_desc->SetAttr("has_bias", has_bias_); + op_desc->SetAttr("out_size", out_size_); + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector w_data(w_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(w_data.data(), -1.f, 1.f, w_dims_.production()); + 
SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(w_, w_dims_, w_data.data()); + if (has_bias_) { + std::vector b_data(b_dims_.production()); + fill_data_rand(b_data.data(), -1.f, 1.f, b_dims_.production()); + SetCommonTensor(b_, b_dims_, b_data.data()); + } + } +}; + +void test_search_seq_fc(Place place) { + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + LoD x_lod; + x_lod.push_back(x_lod_0); + std::unique_ptr tester(new SearchSeqFcOPTest( + place, "def", x_dims, w_dims, b_dims, x_lod, has_bias, out_size)); + arena::Arena arena(std::move(tester), place, 6e-5); + arena.TestPrecision(); + } + } + } + } +} + +TEST(SearchSeqFcOP, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_seq_fc(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index d0e9912e65de7a0aae10f83c31ba4ab5bbd50890..66123625fae606a9022537698cdc1032abb13451 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(zhengxi) -// shuffle_channel_test can pass on local compilation -// while on ci compilation, the test will be killed immediately. - -/* -#include +// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. +// Open this. +/*#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" @@ -30,8 +27,8 @@ class ShuffleChannelComputeTester : public arena::TestCase { // common attributes for this op. 
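+  // Default test case enlarged; the channel count (16) is kept divisible by
+  // group_ (4), as shuffle_channel requires.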
std::string input_ = "X"; std::string output_ = "Out"; - int group_ = 1; - DDim dims_{{1, 2}}; + int group_ = 4; + DDim dims_{{10, 16, 4, 4}}; public: ShuffleChannelComputeTester(const Place& place, @@ -87,7 +84,7 @@ class ShuffleChannelComputeTester : public arena::TestCase { }; void test_shuffle_channel(Place place) { - for (int group : {1, 2, 3}) { + for (int group : {4}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); arena::Arena arena(std::move(tester), place, 2e-5); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 9bbf39b70d5aab67454233efb909f932e0b5bec1..22e475672a87dafee29d68a3824e4f8ac0c15615 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -125,8 +125,7 @@ class UnsqueezeComputeTester : public arena::TestCase { for (size_t i = 0; i < axes_.size(); i++) { name = name + std::to_string(i); axes_tensor_list_.push_back(name); - std::vector in_data = {axes_[i]}; - SetCommonTensor(name, DDim({1}), in_data.data()); + SetCommonTensor(name, DDim({1}), &axes_[i]); } } } @@ -230,7 +229,7 @@ void test_unsqueeze(Place place) { for (int C : {3}) { for (int H : {1}) { for (int W : {5}) { - for (int input_axes_flag : {1, 2}) { + for (int input_axes_flag : {1, 2, 3}) { LOG(INFO) << N << " " << C << " " << H << " " << W << " " << input_axes_flag; std::unique_ptr tester( diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt index 87324375e09cf2633c1ec2a489b9205666754cc1..7dd4f522dbc0f10e8cfb7d19e95da4354ac4b779 100644 --- a/lite/tests/math/CMakeLists.txt +++ b/lite/tests/math/CMakeLists.txt @@ -1,9 +1,17 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemv_int8_compute_test SRCS gemv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + + if(LITE_BUILD_EXTRA) + lite_cc_test(layout_compute_test SRCS layout_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + endif() + + endif() diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index bfb74e6e0a6f5ea0cae199f1c7dc5f1c03e83363..bda50d35633c853ba6e8c8695d0175da38865d1c 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" 
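+// Benchmark timing now uses the Timer from lite/core/profile/timer.h (included
+// above); the legacy lite/tests/utils/timer.h helper is removed below.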
#include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,30 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { DDim dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -110,8 +114,8 @@ void test_conv_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; @@ -162,7 +166,7 @@ void test_conv_fp32(const std::vector& input_dims, param.output->Resize(dim_out); paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - // paddle::lite::fill_tensor_const(*param.x, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -189,7 +193,7 @@ void test_conv_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -201,19 +205,19 @@ void test_conv_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -235,7 +239,8 @@ void test_conv_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << 
pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -280,27 +285,33 @@ void test_conv_fp32(const std::vector& input_dims, TEST(TestConv3x3DW, test_conv3x3_depthwise) { if (FLAGS_basic_test) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& c : {1, 3, 5, 8, 16, 32}) { - std::vector dims; - DDim weights_dim({c, 1, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { - dims.push_back(DDim({batch, c, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + for (auto& c : {1, 3, 5, 8, 16, 32}) { + std::vector dims; + DDim weights_dim({c, 1, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { + dims.push_back(DDim({batch, c, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + c, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - c, - {stride, stride}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -329,7 +340,7 @@ TEST(TestConv5x5DW, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -366,7 +377,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -386,26 +397,32 @@ TEST(TestConv3x3s1, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -420,26 +437,32 @@ TEST(TestConv3x3s2, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for 
(auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -458,30 +481,37 @@ TEST(TestConvRand, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -510,7 +540,7 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index e15b7d22bc2a5859db73f21aa54b1bcdaabf4d2c..27c186d7ceffcaab3019cedf7c281c524be73e44 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,26 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { + auto paddings = *param.paddings; + auto dilations = *param.dilations; DDim dim_out = dim_in; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = 
param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + paddings[0] + paddings[1] - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + paddings[2] + paddings[3] - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -104,8 +104,8 @@ void get_conv_param(const DDim& dim_w, param->bias->set_precision(PRECISION(kFloat)); } param->strides = strides; - param->paddings = pads; - param->dilations = dila; + param->paddings = std::make_shared>(pads); + param->dilations = std::make_shared>(dila); param->fuse_relu = flag_relu; param->groups = g; @@ -288,7 +288,7 @@ void test_conv_int8(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -309,30 +309,30 @@ void test_conv_int8(const std::vector& input_dims, /// compute fp32 output Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_fp32.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compute int8 output - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_int8.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compare result fp32 output if (FLAGS_check_result) { @@ -358,7 +358,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -416,7 +417,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -428,9 +430,9 @@ void test_conv_int8(const std::vector& input_dims, } LOG(INFO) << "test int8 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -473,7 +475,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -507,7 +509,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -544,7 +546,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -564,26 +566,32 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -598,26 +606,32 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for 
(auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -636,30 +650,37 @@ TEST(TestConvRandInt8, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -688,7 +709,7 @@ TEST(TestConvCustomInt8, test_conv_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index e0da07a53462cf902107efc0b6daaeef819f3288..398e745d94bfa71aa8fa2ced227b7add8b24087e 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_transpose_compute.h" @@ -59,17 +59,19 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { auto filter_dims = param.filter->dims(); DDim output_shape = dim_in; output_shape[1] = filter_dims[1] * param.groups; + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (int i = 0; i < 2; i++) { - int kernel_extent = param.dilations[i] * (filter_dims[i + 2] - 1) + 1; + int kernel_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; int output_len = (dim_in[i + 2] - 1) * param.strides[i] + 
kernel_extent - - 2 * param.paddings[i]; + (paddings[2 * i] + paddings[2 * i + 1]); output_shape[i + 2] = output_len; } return output_shape; @@ -101,19 +103,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; param.output = new Tensor; param.output->set_precision(PRECISION(kFloat)); - // paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.filter, 1.f); + paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.filter, 1.f); if (flag_bias) { - // paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.bias, 1.f); + paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.bias, 1.f); } Tensor tmp_weights; tmp_weights.Resize(weight_dim); @@ -128,21 +130,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, new paddle::lite::KernelContext); auto& ctx = ctx1->As(); ctx.SetRunMode(static_cast(cls), th); - /// set param and context - for (auto& dim_in : input_dims) { - param.x->Resize(dim_in); - DDim out_tmp_dims = compute_out_dim(dim_in, param); - if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { - continue; - } - param.output->Resize(out_tmp_dims); - break; - } conv_t.SetParam(param); conv_t.SetContext(std::move(ctx1)); - /// prepare for run - conv_t.PrepareForRun(); - for (auto& dim_in : input_dims) { CHECK_EQ(weight_dim[0], dim_in[1]) << "input channel must equal to weights channel"; @@ -152,9 +141,11 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } param.x->Resize(dim_in); param.output->Resize(dim_out); - - // paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.x, 1.f); + param.filter->CopyDataFrom(tmp_weights); + // prepare for run + conv_t.PrepareForRun(); + paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -182,8 +173,10 @@ void test_conv_transpose_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], + pads[3], pads[0], + pads[1], flag_bias, flag_relu); } @@ -194,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_t.Launch(); - t0.end(); + t0.Stop(); } float gops = 2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3]; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -228,7 +221,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim 
- << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -240,9 +234,9 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } LOG(INFO) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -278,30 +272,37 @@ TEST(TestConvRand, test_conv_transpose_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cin, cout / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_h0 : {0, 1, 2}) { + for (auto& pad_h1 : {0, 1, 2}) { + for (auto& pad_w0 : {0, 1, 2}) { + for (auto& pad_w1 : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cin, cout / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_transpose_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 4}, + {FLAGS_power_mode}); + } } } - test_conv_transpose_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -330,7 +331,7 @@ TEST(TestConvCustom, test_conv_transpose_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index 06a1a0a65e1e5d0abb4a3eef2a6bf7d1e7ce5db0..fde5aacb1c1c21810c06a51eb6fa1f0cc4c3307a 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra, dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), 
db, dbias_int8, @@ -206,21 +206,21 @@ bool test_gemm_int8(bool tra, trb, scale_merge_int8.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias, @@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra, trb, scale_merge_fp32.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index c64e78d66a4193f1b20c525120d8b0281afc9a9c..623615c8da16326da3c233687915935aa5a88d64 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -165,7 +165,7 @@ bool test_gemv_int8( dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_fp32, @@ -177,21 +177,21 @@ bool test_gemv_int8( dbias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_int8, @@ -203,15 +203,15 @@ bool test_gemv_int8( dbias_int8, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " 
<< m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/layout_compute_test.cc b/lite/tests/math/layout_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a566924548d6f3adac805eb80a574a9cd5c2afbf --- /dev/null +++ b/lite/tests/math/layout_compute_test.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/operators/op_params.h" +#include "lite/tests/utils/naive_math_impl.h" +#include "lite/tests/utils/tensor_utils.h" + +#ifdef LITE_WITH_ARM +#include "lite/kernels/arm/layout_compute.h" +#endif // LITE_WITH_ARM + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(batch, 1, "batch size"); +DEFINE_int32(in_channel, 32, "input channel"); +DEFINE_int32(in_height, 112, "input height"); +DEFINE_int32(in_width, 112, "input width"); + +DEFINE_bool(flag_nchw, true, "do nchw to nhwc"); + +typedef paddle::lite::DDim DDim; +typedef paddle::lite::Tensor Tensor; +typedef paddle::lite::operators::LayoutParam LayoutParam; + +using paddle::lite::profile::Timer; + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w 
* input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] +template +void nhwc2nchw_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +#ifdef LITE_WITH_ARM +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "saber compute end"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "run"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +#else +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +#endif // LITE_WITH_ARM + +#if 1 // +TEST(TestLayout, test_Layout_fp32) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC"; + test_layout_fp32_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW"; + test_layout_fp32_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestLayout, test_Layout_int8) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC int8"; + test_layout_int8_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW int8"; + test_layout_int8_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif + +#if 1 /// custom +TEST(TestLayoutCustom, test_Layout_custom_size) { + test_layout_fp32_nchw( + {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, + true, + {FLAGS_threads}, + {FLAGS_power_mode}); +} +#endif // custom diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc index 9f4a9435945f8478a9285a56f03b20e941b3f8d7..73a5ba5606c2635c2df2792a3ccb6544715384a9 100644 --- a/lite/tests/math/pool_compute_test.cc +++ b/lite/tests/math/pool_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/pool_compute.h" @@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::PoolParam PoolParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::PoolParam& param) { @@ -69,8 +69,7 @@ DDim compute_out_dim(const DDim& dim_in, auto kernel_w = param.ksize[1]; auto h = dim_in[2]; auto w = dim_in[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; int stride_h = param.strides[0]; int stride_w = param.strides[1]; bool ceil_mode = param.ceil_mode; @@ -79,11 +78,15 @@ DDim compute_out_dim(const DDim& dim_in, int wout = 1; if (!flag_global) { if (!ceil_mode) { - hout = (h - kernel_h + 2 * pad_h) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w) / 
stride_w + 1; + hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1; + wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1; } else { - hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1; + hout = + (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h + + 1; + wout = + (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w + + 1; } } dim_out[2] = hout; @@ -116,7 +119,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -195,18 +198,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -243,7 +250,7 @@ void test_pool_fp32(const std::vector& input_dims, param.ksize = ksize; param.strides = strides; - param.paddings = pads; + param.paddings = std::make_shared>(pads); param.ceil_mode = ceil_mode; param.global_pooling = flag_global; param.pooling_type = pooling_type; @@ -313,18 +320,18 @@ void test_pool_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); pool.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1]; LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape" - << dim_out << ", running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ", running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -399,31 +406,38 @@ TEST(TestPoolRand, test_pool_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& flag_global : {false, true}) { - for (auto& exclusive : {false, true}) { - for (auto& ceil_mode : {false, true}) { - for (auto& pooling_type : {"max", "avg"}) { - bool adaptive = false; - bool use_quantizer = false; - std::vector dims; - for (auto& batch : {1, 2}) { - for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& flag_global : {false, true}) { + for (auto& exclusive : {false, true}) { + for (auto& ceil_mode : {false, true}) { + for (auto& pooling_type : 
{"max", "avg"}) { + bool adaptive = false; + bool use_quantizer = false; + std::vector dims; + for (auto& batch : {1, 2}) { + for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_pool_fp32( + dims, + {kh, kw}, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + ceil_mode, + flag_global, + exclusive, + adaptive, + use_quantizer, + pooling_type, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_pool_fp32(dims, - {kh, kw}, - {stride, stride}, - {pad, pad}, - ceil_mode, - flag_global, - exclusive, - adaptive, - use_quantizer, - pooling_type, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -443,7 +457,7 @@ TEST(TesPoolCustom, test_pool_fp32_custom_size) { {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, {FLAGS_kernel_h, FLAGS_kernel_w}, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, FLAGS_ceil_mode, FLAGS_flag_global, FLAGS_exclusive, diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..886dba6ac5a390c5eca4a9b499bfb57e2b077a32 --- /dev/null +++ b/lite/tests/math/sgemm_c4_compute_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; +using paddle::lite::profile::Timer; + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "gemm_c4: M"); +DEFINE_int32(N, 512, "gemm_c4: N"); +DEFINE_int32(K, 512, "gemm_c4: K"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemm_c4( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 3) / 4 * 4; + int k_round = (k + 3) / 4 * 4; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c4 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c4; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c4.Resize({size_b_c4}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + ta_c4.set_precision(PRECISION(kFloat)); + tb_c4.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + fill_tensor_rand(tc, -1.f, 1.f); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c4 = tb_c4.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c4(da, da_c4, k, m, k, true); + basic_trans_mat_to_c4(db, db_c4, n, k, n, false); + + LOG(INFO) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c4(false, + false, + m, + n, + k, + 1.f, + da, + k, + db, + n, + 0.f, + dc_basic, + n, + dbias, + has_bias, + has_relu); + } + Timer t0; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + } + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c4: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c4: "; + print_tensor(tb_c4); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { + for (auto& k : {1, 3, 8, 59, 234}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemm_c4( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemm_c4(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K + << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu + << " passed!!"; +} diff --git a/lite/tests/math/sgemm_compute_test.cc b/lite/tests/math/sgemm_compute_test.cc index 1621ceb9047125d0d2a4141a01111eb54892dee9..6df5e671fe5138ab6b6ac5941604b9b91759a661 100644 --- a/lite/tests/math/sgemm_compute_test.cc +++ b/lite/tests/math/sgemm_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -171,7 +171,7 @@ bool test_sgemm(bool tra, if (i == FLAGS_repeats - 1) { memcpy(dc, dc_backup, sizeof(float) * m * ldc); } - t0.start(); + t0.Start(); paddle::lite::arm::math::sgemm_prepack(trb, m, n, @@ -186,15 +186,15 @@ bool test_sgemm(bool tra, has_bias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dd2d322955d2c628366075a6dddb31bed2338ee --- /dev/null +++ b/lite/tests/math/sgemv_compute_test.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; + +DEFINE_int32(cluster, 3, "cluster id"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "sgemv: M"); +DEFINE_int32(K, 512, "sgemv: K"); + +DEFINE_bool(traA, false, "gemv: A transpose"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemv( + bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) { + Tensor ta; + Tensor tb; + Tensor tc; + Tensor tc_basic; + Tensor tbias; + + ta.Resize({m, k}); + tb.Resize({k, 1}); + tc.Resize({m, 1}); + tc_basic.Resize({m, 1}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + // fill_tensor_const(ta, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + // fill_tensor_const(tb, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + + LOG(INFO) << "sgemv M: " << m << ", K: " << k + << ", transA: " << (tra ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); +#ifdef LITE_WITH_ARM + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto dc = tc.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + if (FLAGS_check_result) { + basic_gemv( + m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu); + } + paddle::lite::profile::Timer t0; + //! 
compute + double ops = 2.0 * m * k; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + /// warmup + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + } + + t0.Reset(); + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls + << ", threads: " << ths << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + /// fp32 result + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "saber result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestLiteSgemv, Sgemv) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemv test"; + for (auto& m : {1, 3, 8, 21, 32, 397}) { + for (auto& k : {1, 3, 8, 17, 59, 234}) { + for (auto& tra : {true, false}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemv( + tra, m, k, has_bias, has_relu, FLAGS_cluster, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? "true" : "false") + << ", threads: " << th << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? 
"true" : "false") + << ", threads: " << th << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemvCustom, Sgemv_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemv(FLAGS_traA, + FLAGS_M, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_cluster, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " passed!!"; +} diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index 846126ac247ee685bd8772ede87635c45b52f79a..fd868e85acdbdfe39abc0bcbfa50f85db12a50b6 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -14,6 +14,108 @@ #pragma once +template +static void basic_trans_mat_to_c4(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 3) / 4 * 4; + int k_round = (K + 3) / 4 * 4; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 4; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 4 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + if (4 * (i + 1) - M > 0) { + switch (4 * (i + 1) - M) { + case 3: + in1 = zero_buf; + case 2: + in2 = zero_buf; + case 1: + in3 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} + +template +static void basic_gemm_c4(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! 
trans c to c4 + basic_trans_mat_to_c4(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} + template static void basic_gemm(bool trans_a, bool trans_b, @@ -228,8 +330,10 @@ static void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -237,21 +341,24 @@ static void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(Dtype)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { @@ -289,8 +396,10 @@ void deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -302,8 +411,9 @@ void deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); Dtype2* workspace_ptr = static_cast(malloc(sizeof(float) * m * n * group)); @@ -316,7 +426,7 @@ void deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -346,8 +456,10 @@ void deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 4873e70773f31425d628ee2bbdd36f2cb2f921f1..319f26ff82dd47718a7fc69d64522ca622ecaf3e 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -20,6 +20,7 @@ BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF +SHUTDOWN_LOG=ON readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -93,7 +94,7 @@ function make_tiny_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - 
-DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -136,7 +137,7 @@ function make_full_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ @@ -236,10 +237,10 @@ function make_cuda { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DWITH_TESTING=OFF \ -DLITE_WITH_ARM=OFF \ - -DLITE_WITH_PYTHON=ON \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON - - make publish_inference_python_lib -j8 + + make publish_inference -j4 cd - } @@ -290,6 +291,7 @@ function print_usage { echo -e " ./build.sh --arm_os= --arm_abi= --arm_lang= test" echo echo -e "optional argument:" + echo -e "--shutdown_log: (OFF|ON); controls whether to shutdown log, default is ON" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" echo -e "--build_java: (OFF|ON); controls whether to publish java api lib (Only ANDROID is supported)" @@ -366,6 +368,10 @@ function main { BUILD_TAILOR="${i#*=}" shift ;; + --shutdown_log=*) + SHUTDOWN_LOG="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 03a74046f17ad03bccc7b6d5050acae9d643686c..1509f563b2e4f2008e7ea4f37ca4e5491464e9cc 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,8 +5,8 @@ set -ex ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_static" # c++_shared, c++_static -DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ +ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310 +DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 8be8e6e6b6da1e2aa38b6fcbcf95b23a8543a5be..8b5741a7a68bee3e783dff68e4bd4a8fc7cd8527 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -1,9 +1,10 @@ #!/bin/bash +# The git version of CI is 2.7.4. This script is not compatible with git version 1.7.1. set -ex TESTS_FILE="./lite_tests.txt" LIBS_FILE="./lite_libs.txt" - +CUDNN_ROOT="/usr/local/cudnn" readonly ADB_WORK_DIR="/data/local/tmp" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" @@ -162,6 +163,12 @@ function cmake_x86_for_CI { # make test_generated_code -j$NUM_CORES_FOR_COMPILE } +function cmake_cuda_for_CI { + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DLITE_WITH_CUDA=ON -DWITH_MKLDNN=OFF -DLITE_WITH_X86=OFF ${common_flags} -DLITE_WITH_PROFILE=ON -DWITH_MKL=OFF \ + -DLITE_BUILD_EXTRA=ON -DCUDNN_ROOT=${CUDNN_ROOT} +} + function cmake_gpu { prepare_workspace cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" @@ -195,7 +202,6 @@ function test_server { # Due to the missing of x86 kernels, we skip the following tests temporarily. 
# TODO(xxx) clear the skip list latter local skip_list=("test_paddle_api" "test_cxx_api" - "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" "test_light_api" "test_apis" "test_model_bin" ) @@ -227,6 +233,16 @@ function build_test_server { test_model_optimize_tool_compile } +# The CUDA version of CI is cuda_10.1.243_418.87.00_linux. +# The cuDNN version is cudnn-10.1-linux-x64-v7.5.0.56. +function build_test_cuda_server { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + cmake_cuda_for_CI + build +} + function build_test_train { mkdir -p ./build cd ./build @@ -951,6 +967,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + build_test_cuda_server) + build_test_cuda_server + shift + ;; build_test_server) build_test_server shift diff --git a/lite/tools/debug/debug_utils.h b/lite/tools/debug/debug_utils.h index 7f77b90488657aab96c7942d703e86d64723f5fc..ff08c47e524cacee37e95572a7f7a2fb444d4d16 100644 --- a/lite/tools/debug/debug_utils.h +++ b/lite/tools/debug/debug_utils.h @@ -27,7 +27,7 @@ #include "lite/model_parser/pb/var_desc.h" #include "lite/utils/all.h" -DEFINE_string(model_dir, "", "Model dir path"); +DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(input_file, "", "Input datas file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_bool(output_topo, true, "Dump runtime topology or not"); @@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { CHECK(conf); #define CHECK_NON_EMPTY(name__) \ CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." - CHECK_NON_EMPTY(model_dir); + CHECK_NON_EMPTY(model_path); if (FLAGS_output_topo) { CHECK_NON_EMPTY(topo_output_file); } @@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { CHECK_NON_EMPTY(tensor_output_file); } #undef CHECK_NON_EMPTY - conf->model_dir = FLAGS_model_dir; + conf->model_dir = FLAGS_model_path; conf->topo_output_file = FLAGS_topo_output_file; conf->tensor_output_file = FLAGS_tensor_output_file; conf->input_file = FLAGS_input_file; diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index 0bccfe2804a9ba17473575815bfe4b2e9635f234..f18047556874a82d28c5964a1b5fd2fa8284c814 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src, int dstw, int dsth) { resize(src, dst, srcFormat, srcw, srch, dstw, dsth); - /* - int size = srcw * srch; - if (srcw == dstw && srch == dsth) { - if (srcFormat == NV12 || srcFormat == NV21) { - size = srcw * (floor(1.5 * srch)); - } else if (srcFormat == BGR || srcFormat == RGB) { - size = 3 * srcw * srch; - } else if (srcFormat == BGRA || srcFormat == RGBA) { - size = 4 * srcw * srch; - } - memcpy(dst, src, sizeof(uint8_t) * size); - return; - } - double scale_x = static_cast(srcw / dstw); - double scale_y = static_cast(srch / dsth); - - int* buf = new int[dstw * 2 + dsth * 2]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); - - compute_xy( - srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); - - int w_out = dstw; - int w_in = srcw; - int num = 1; - int orih = dsth; - if (srcFormat == GRAY) { - num = 1; - } else if (srcFormat == NV12 || srcFormat == NV21) { - num = 1; - int hout = static_cast(0.5 * dsth); - dsth += hout; - } else if (srcFormat == 
BGR || srcFormat == RGB) { - w_in = srcw * 3; - w_out = dstw * 3; - num = 3; - - } else if (srcFormat == BGRA || srcFormat == RGBA) { - w_in = srcw * 4; - w_out = dstw * 4; - num = 4; - } - - int* xofs1 = nullptr; - int* yofs1 = nullptr; - int16_t* ialpha1 = nullptr; - if (orih < dsth) { // uv - int tmp = dsth - orih; - int w = dstw / 2; - xofs1 = new int[w]; - yofs1 = new int[tmp]; - ialpha1 = new int16_t[srcw]; - compute_xy(srcw / 2, - srch / 2, - w, - tmp, - scale_x, - scale_y, - xofs1, - yofs1, - ialpha1, - ibeta + orih); - } - int cnt = w_out >> 3; - int remain = w_out % 8; - int32x4_t _v2 = vdupq_n_s32(2); - #pragma omp parallel for - for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out]; - int16_t* rowsbuf1 = new int16_t[w_out]; - int sy = yofs[dy]; - if (dy >= orih) { - xofs = xofs1; - yofs = yofs1; - ialpha = ialpha1; - } - if (sy < 0) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows1p++ = ((*S1pl++) * a1) >> 4; - } else { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } else { - // hresize two rows - const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S0pl = S0; - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows0p = ((*S0pl++) * a1) >> 4; - *rows1p = ((*S1pl++) * a1) >> 4; - rows0p++; - rows1p++; - } else { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } - int ind = dy * 2; - int16_t b0 = ibeta[ind]; - int16_t b1 = ibeta[ind + 1]; - int16x8_t _b0 = vdupq_n_s16(b0); - int16x8_t _b1 = vdupq_n_s16(b1); - uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - int re_cnt = cnt; - if (re_cnt > 0) { - #ifdef __aarch64__ - asm volatile( - "1: \n" - "ld1 {v0.8h}, [%[rows0p]], #16 \n" - "ld1 {v1.8h}, [%[rows1p]], #16 \n" - "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" - "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" - "smull v2.4s, v0.4h, %w[_b0].4h \n" - "smull2 v4.4s, v0.8h, %w[_b0].8h \n" - "smull v3.4s, v1.4h, %w[_b1].4h \n" - "smull2 v5.4s, v1.8h, %w[_b1].8h \n" - - "ssra v6.4s, v2.4s, #16 \n" - "ssra v7.4s, v4.4s, #16 \n" - "ssra v6.4s, v3.4s, #16 \n" - "ssra v7.4s, v5.4s, #16 \n" - - "shrn v0.4h, v6.4s, #2 \n" - "shrn2 v0.8h, v7.4s, #2 \n" - "subs %w[cnt], %w[cnt], #1 \n" - "sqxtun v1.8b, v0.8h \n" - "st1 {v1.8b}, [%[dp]], #8 \n" - "bne 1b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - #else - asm volatile( - "mov r4, #2 \n" - "vdup.s32 q12, r4 \n" - "0: \n" - "vld1.s16 {d2-d3}, [%[rows0p]]!\n" 
- "vld1.s16 {d6-d7}, [%[rows1p]]!\n" - "vorr.s32 q10, q12, q12 \n" - "vorr.s32 q11, q12, q12 \n" - - "vmull.s16 q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" - - "vsra.s32 q10, q0, #16 \n" - "vsra.s32 q11, q1, #16 \n" - "vsra.s32 q10, q2, #16 \n" - "vsra.s32 q11, q3, #16 \n" - - "vshrn.s32 d20, q10, #2 \n" - "vshrn.s32 d21, q11, #2 \n" - "subs %[cnt], #1 \n" - "vqmovun.s16 d20, q10 \n" - "vst1.8 {d20}, [%[dp]]! \n" - "bne 0b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1) - : "cc", - "memory", - "r4", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12"); - - #endif // __aarch64__ - } - for (int i = 0; i < remain; i++) { - // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> - // INTER_RESIZE_COEF_BITS; - *dp_ptr++ = - (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + - (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> - 2); - } - } - delete[] buf; - */ } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 11673e19041883bfa6ca7a45f03ca3feca76dd20..5a46a9e48e8202fe29ec9fc7d950ccf15920cc32 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -133,7 +133,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param degree: Rotate degree, support 90, 180 and 270 @@ -158,7 +158,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param flip_param: flip parameter, support X, Y and XY @@ -190,7 +190,7 @@ class ImagePreprocess { * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW diff --git a/lite/utils/io.h b/lite/utils/io.h index 98a0f39b084c1ec0767299501f6f359dab2017b3..92405cae862f062090665aecc8eb7f207cf059e7 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -14,9 +14,12 @@ #pragma once +#include #include +#include #include #include +#include #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -46,11 +49,68 @@ static void MkDirRecur(const std::string& path) { // read buffer from file static std::string ReadFile(const std::string& filename) { std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } std::ostringstream buf; char ch; while (buf && ifile.get(ch)) buf.put(ch); + ifile.close(); return buf.str(); } +// read lines from file +static std::vector ReadLines(const std::string& filename) { + std::ifstream 
ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + std::vector res; + std::string tmp; + while (getline(ifile, tmp)) res.push_back(tmp); + ifile.close(); + return res; +} + +static void WriteLines(const std::vector& lines, + const std::string& filename) { + std::ofstream ofile(filename.c_str()); + if (!ofile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + for (const auto& line : lines) { + ofile << line << "\n"; + } + ofile.close(); +} + +static bool IsDir(const std::string& path) { + DIR* dir_fd = opendir(path.c_str()); + if (dir_fd == nullptr) return false; + closedir(dir_fd); + return true; +} + +static std::vector ListDir(const std::string& path, + bool only_dir = false) { + if (!IsDir(path)) { + LOG(FATAL) << "[" << path << "] is not a valid dir path."; + } + + std::vector paths; + DIR* parent_dir_fd = opendir(path.c_str()); + dirent* dp; + while ((dp = readdir(parent_dir_fd)) != nullptr) { + // Exclude '.', '..' and hidden dir + std::string name(dp->d_name); + if (name == "." || name == ".." || name[0] == '.') continue; + if (IsDir(Join({path, name}, "/"))) { + paths.push_back(name); + } + } + closedir(parent_dir_fd); + return paths; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.cc b/lite/utils/logging.cc index 6351be95acdb7311f7d5604d9af3cfe8945bc424..e9ee5861baca85966ce53ac1570d7ebc23a002cb 100644 --- a/lite/utils/logging.cc +++ b/lite/utils/logging.cc @@ -43,10 +43,10 @@ void gen_log(STL::ostream& log_stream_, gettimeofday(&tv, NULL); // print date / time - log_stream_ << '[' << level << ' ' << std::setw(2) << 1 + tm_time.tm_mon - << '/' << std::setw(2) << tm_time.tm_mday << ' ' << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << '.' << std::setw(3) + log_stream_ << '[' << level << ' ' << STL::setw(2) << 1 + tm_time.tm_mon + << '/' << STL::setw(2) << tm_time.tm_mday << ' ' << STL::setw(2) + << tm_time.tm_hour << ':' << STL::setw(2) << tm_time.tm_min << ':' + << STL::setw(2) << tm_time.tm_sec << '.' 
<< STL::setw(3) << tv.tv_usec / 1000 << " "; if (len > kMaxLen) { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index e85753ec301c62152ce484105d6c42ac1b69ab16..c2c999fd70f3eee78c1deaf5ec2c4fea4e4f3fd1 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -30,6 +30,18 @@ #include #include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_ANDROID +#include +// Android log macors +#define ANDROID_LOG_TAG "Paddle-Lite" +#define ANDROID_LOG_I(msg) \ + __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_W(msg) \ + __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_F(msg) \ + __android_log_print(ANDROID_LOG_FATAL, ANDROID_LOG_TAG, msg) +#endif + // NOLINTFILE() // LOG() @@ -93,11 +105,22 @@ class LogMessage { const char* func, int lineno, const char* level = "I") { + level_ = level; paddle::lite::gen_log(log_stream_, file, func, lineno, level); } ~LogMessage() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + if (level_ == "I") { + ANDROID_LOG_I(log_stream_.str().c_str()); + } else if (level_ == "W") { + ANDROID_LOG_W(log_stream_.str().c_str()); + } else { + fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + assert(false); + } +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } @@ -105,6 +128,7 @@ class LogMessage { protected: STL::stringstream log_stream_; + std::string level_; LogMessage(const LogMessage&) = delete; void operator=(const LogMessage&) = delete; @@ -121,7 +145,11 @@ class LogMessageFatal : public LogMessage { ~LogMessageFatal() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_F(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); + #ifndef LITE_ON_TINY_PUBLISH abort(); #else @@ -152,6 +180,9 @@ class VLogMessage { return; } log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_I(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index 61999a79e3d9e997b23943e46a419577ee2de44c..d821078e366b1ade8b093e08a63829bcf35c1376 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "lite/utils/replace_stl/stream.h" +#include +#include #ifdef LITE_ON_TINY_PUBLISH @@ -20,93 +22,119 @@ namespace paddle { namespace lite { namespace replace_stl { +void ostream::pad(const std::string& text) { + if (display_width_ > 0) { + if (display_width_ < text.size()) { + fprintf(stderr, "Replace STL IO display length less than text\n"); + assert(false); + } else { + for (int i = 0; i < display_width_ - text.size(); ++i) { + data_.push_back(' '); + } + display_width_ = -1; + } + } +} + #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) data_ = data_ + std::to_string(obj_) +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = std::to_string(obj_); \ + pad(text); \ + data_ = data_ + text; + #endif template <> ostream& ostream::operator<<(const char* obj) { - _data = _data + std::string(obj); + data_ = data_ + std::string(obj); return *this; } template <> ostream& ostream::operator<<(const char& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const std::string& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const int16_t& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const int& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const bool& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const float& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); + return *this; +} + +template <> +ostream& ostream::operator<<(const LiteIoWidth& obj) { + int width = obj.width; + assert(width > 0); + display_width_ = width; return *this; } diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h index e6bb261706bd7f25943fd3a6fad1ba97b9dfe3a4..3288a1986906b3fd600b91b6a56ae7134644456f 100644 --- a/lite/utils/replace_stl/stream.h +++ b/lite/utils/replace_stl/stream.h @@ -29,18 +29,25 @@ namespace lite { namespace replace_stl { +struct LiteIoWidth { + explicit LiteIoWidth(int value) : width(value) {} + int width; +}; + +static LiteIoWidth setw(int width) { return LiteIoWidth(width); } + class ostream { public: ostream() {} - explicit ostream(const 
std::string& x) : _data(x) {} + explicit ostream(const std::string& x) : data_(x) {} ~ostream() {} - const char* c_str() { return _data.c_str(); } + const char* c_str() { return data_.c_str(); } - const std::string& str() { return _data; } + const std::string& str() { return data_; } const std::string& str(const std::string& x) { - _data = x; - return _data; + data_ = x; + return data_; } template @@ -50,7 +57,9 @@ class ostream { ostream& operator<<(const T* obj); private: - std::string _data; + void pad(const std::string& text); + std::string data_; + int display_width_{-1}; // -1 refers to no setting }; class stringstream : public ostream { diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp index f39d012e08c124feacbd72fa2879e60b352c2785..1a90cb5bdc8b0cf96785b59cc37076b2beaa2572 100644 --- a/mobile/src/fpga/V2/api.cpp +++ b/mobile/src/fpga/V2/api.cpp @@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.images_in[i] = (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.scales_in[i] = out->scale; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; expand_conv_arg(&arg->conv_arg[i]); diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100644 new mode 100755 index dc3c3356e838c88023d0efa1c40bf6f910aece89..917491c371a4433e212f4b7a74707d7350363821 --- a/mobile/src/fpga/V2/image.cpp +++ b/mobile/src/fpga/V2/image.cpp @@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * sizeof(int8_t)); - for (j = 0; - j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - j++) { - images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5); - } } align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); @@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, memcpy( (int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, - images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, + images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t)); tmp_channel += channel_num[i]; @@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, } } fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); + for (i = 0; i < image_num; i++) { + fpga_free(images_in_tmp[i]); + } + fpga_free(images_in_tmp); } void split_image(int8_t *image_in, void **images_out, int image_num, diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100644 new mode 100755 index aa150e0c6cecbdf278f3d776ebba4ec81ed003a1..a3c179994a2be8dc4a87441febc7e6db4ecd797c --- a/mobile/src/fpga/V2/pe.cpp +++ b/mobile/src/fpga/V2/pe.cpp @@ -109,7 +109,7 @@ using namespace std; // NOLINT #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_MODE_RECIPROCAL 0x890 @@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { // DLOG << " activation_type:" << active_args.activation_type // << " leaky_relu_negative_slope:" // << 
active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - + DLOG << " reg_ActivationArgs:"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { ret = -EIO; @@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + + reg_writeq(output_scale, REG_SCALE_PARAMETER); // new reg_writeq((args.driver.row_padding_down << 45) | (args.driver.row_padding_up << 34) | @@ -270,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { args.driver.filter_pad_width_mul_channel, REG_CONV_REG1); - reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | - (args.driver.filter_row << 8) | - (args.driver.filter_height << 4) | args.driver.filter_width, - REG_CONV_REG2); + reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | + (args.driver.filter_row << 10) | + (args.driver.filter_height << 5) | args.driver.filter_width, + REG_CONV_REG2); reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | (args.driver.prog_full_cnt << 16) | @@ -358,7 +362,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; @@ -366,66 +369,74 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( + args.kernel.stride_h + 1); + uint64_t output_width = (uint64_t)( (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + args.kernel.stride_w + 1); + uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - 
FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - + uint64_t image_one_pad_per_row = (uint64_t)args.image.width * + (uint64_t)args.image.channels +(uint64_t)args.image.pad_width * + (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * + (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = + (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + uint64_t filter_row_align = + C_align_32 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_32 * + (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; + uint64_t mult_factor = 0; + float average_reciprocal = args.kernel_reciprocal; + uint32_t* kernel_reciprocal; + kernel_reciprocal =(reinterpret_cast(&average_reciprocal)); + if (args.mode == 1) + mult_factor = (uint64_t)(*kernel_reciprocal) | + ((uint64_t)1 << 32) | ((uint64_t)1 << 40); + else + 
mult_factor = + (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -433,41 +444,21 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - // reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); + reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq((uint64_t)args.image.channels, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(mult_factor, 0x840); // dw donot care + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + if (args.mode == 1) + cmd = (uint64_t)4; + else + cmd = (uint64_t)8; + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { @@ -478,14 +469,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { } DLOG << "after reg poll"; - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -518,19 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 int ret = 0; - uint64_t output_scale = 0; - - // uint64_t reg_ActivationArgs = 0; - // 
ActivationArgs active_args; - // active_args.activation_type = args.output.activation.activation_type; - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -540,18 +511,47 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); + uint64_t image0_physical_address = 0; + uint64_t image1_physical_address = 0; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + image0_physical_address = vaddr_to_paddr(args.image0.address); + image1_physical_address = vaddr_to_paddr(args.image1.address); + image_physical_address = + image0_physical_address | (image1_physical_address << 32); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * + (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t result_addr_row = + output_physical_address | (image_amount_per_row << 32); + uint64_t kernel_padding_step = 0; + kernel_padding_step = ((uint64_t)args.image0.height * 2) | + ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); + uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | + ((image_amount_per_row / 32 - 1) << 16) | + (((uint64_t)args.image0.height * 2) << 32); + uint64_t image_row_col_padding_down = image_amount_per_row | + (image_amount_per_row << 32); + float quantParam = (args.output.scale_address)[0]; + uint32_t* ew_scale = reinterpret_cast(&quantParam); + uint64_t ew_scale_mult_factor = (*ew_scale) | + ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(32, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(((image_amount_per_row*2) << 32), 0x838); + reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care + reg_writeq(((uint64_t)32 << 32), 0x848); + reg_writeq(0, 0x858); + uint64_t cmd = 0; + cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; @@ -560,12 +560,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait 
Irq Timeout!"); } - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif @@ -870,7 +864,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif } - if (sub_conv_num > 1) { + /*if (sub_conv_num > 1) { float max_scale = -1.0f; #ifdef COST_TIME_PRINT gettimeofday(&start, NULL); @@ -894,19 +888,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << std::endl; #endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } + }*/ return 0; } // ComputeFpgaDeconv @@ -940,8 +922,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { << " image_width:" << args.image.width << " pad_height:" << args.image.pad_height << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; + DLOG << " filter_address:" << args.filter_address; + //<< " bias_address:" << args.bias_address; DLOG << " kernel_height:" << args.kernel.height << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h @@ -951,11 +933,10 @@ int ComputeDWConv(const struct DWconvArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 DLOG << "DWConv"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); // return 0; - uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; - // uint64_t cmd = args.relu_enabled; uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; @@ -966,57 +947,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_physical_address = vaddr_to_paddr(args.output.address); filter_physical_address = vaddr_to_paddr(args.filter_address); bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); + uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t) + ((args.image.height + args.image.pad_height * 2 - + args.kernel.height) / args.kernel.stride_h +1); + uint64_t output_width = (uint64_t) + (((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + 1) * args.sub_conv_num); uint64_t 
image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); + align_to_x((uint64_t)args.image.width * + (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - + (uint64_t)args.image.width * (uint64_t)args.image.channels + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + + uint64_t filter_row_align = + C_align_64 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_64 * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; + uint64_t filter_param = 
filter_row_align | (filter_amount_align << 16) | + (sub_filter_amount_align << 32) | + (((uint64_t)args.sub_conv_num -1) << 48); + uint64_t channel_parameter = + (uint64_t)args.image.channels | (C_align_64 << 16); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -1024,73 +1017,31 @@ int ComputeDWConv(const struct DWconvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(channel_parameter, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(0, 0x840); + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + reg_writeq(filter_physical_address, 0x850); + reg_writeq(filter_param, 0x858); + reg_writeq(((bias_physical_address+C_align_64*4) | + (bias_physical_address << 32)), 0x860); + cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; + DLOG << "DWconv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); } DLOG << "after reg poll"; - - // 
*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif diff --git a/mobile/src/fpga/common/driver.cpp b/mobile/src/fpga/common/driver.cpp old mode 100644 new mode 100755 index 911704965aac3b6897b70dc60cb23fb4f3e59979..b7ce4d32474465988f0e2c02763d21bfdf9a7530 --- a/mobile/src/fpga/common/driver.cpp +++ b/mobile/src/fpga/common/driver.cpp @@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { uint64_t i = 0; /*timeout精确性待确认*/ int64_t timeout = time * 6; - usleep(1); for (i = 0; i < timeout; i++) { + usleep(1); if (val == reg_readq(reg)) { break; } diff --git a/mobile/src/fpga/common/fpga_common.h b/mobile/src/fpga/common/fpga_common.h old mode 100644 new mode 100755 index a798d54459b86f67a28c158dc30c82131ea48626..a767cd2606bb351b42f8d2a6bc944c66a2fa39a7 --- a/mobile/src/fpga/common/fpga_common.h +++ b/mobile/src/fpga/common/fpga_common.h @@ -211,6 +211,7 @@ struct ConcatArgs { uint32_t out_channel; uint32_t height; uint32_t width; + std::vector> vector_concat_space; }; struct SplitConvArgs { diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp old mode 100644 new mode 100755 index 951fbb5f3708bf511bfcbbb0669fb7a56a4eb7c4..56cc8927f035b16963b639bc960b20532b931f44 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp @@ -37,7 +37,7 @@ bool AnchorGeneratorKernel::Init( int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, -20, 39, 36, -43, -34, 59, 49, -63, -54, 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; + 134, -204, -188, 220, 204, -281, -395, 296, 411}; int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, diff --git a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp old mode 100644 new mode 100755 index 716531fcab47252c86486d2cb1f325ca97423935..8442eef8b2314d5035d673c12dd87590cfb8064d --- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -53,6 +53,15 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.channel_num = channel_num; concatArgs.height = height; concatArgs.width = width; + + auto deleter = [](void *p) { fpga::fpga_free(p); }; + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.images_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.scales_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.channel_num), deleter)); + param->SetFpgaArgs(concatArgs); return true; } diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100644 new mode 100755 index 43b9355c99be4a22781cac10309a24c7dd3ac76c..57ccf9f00d9e4ab04bbed16af8b02e4aaa537847 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -12,12 +12,9 @@ WITHOUT WARRANTIES 
OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef ELEMENTWISEADD_OP - +#include #include "operators/kernel/elementwise_add_kernel.h" -#include -#include "fpga/V2/api.h" - namespace paddle_mobile { namespace operators { @@ -60,10 +57,36 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } +void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100644 new mode 100755 index 6d5ad505732f58cfc9a50f8627a07956cd96d45c..de603418742da5b9672259a1bb414567853a8cb5 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef FUSION_ELEMENTWISEADDRELU_OP - +#include #include "operators/kernel/elementwise_add_relu_kernel.h" namespace paddle_mobile { @@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel::Init( return true; } +void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ? 0 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} + template <> void ElementwiseAddReluKernel::Compute( const ElementwiseAddReluParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAddRelu(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100644 new mode 100755 index fcf0889b4a66919efc677e211a1da453fd761de4..c7cd6575e4010f7ba9aa12882a8968cd558049b9 --- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp @@ -110,7 +110,27 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { } } output->Resize(framework::make_ddim(shape)); + + bool reshapeNeedFlg = 1; if (output->dims() == input->dims()) { + reshapeNeedFlg = 0; + } else if (output->dims().size() != input->dims().size()) { + auto inputdimsize = input->dims().size(); + auto outputdimsize = output->dims().size(); + int smallersize = + inputdimsize > outputdimsize ? 
outputdimsize : inputdimsize; + int i = 0; + for (i = 0; i < smallersize; i++) { + if ((input->dims())[i] != (output->dims())[i]) + break; + } + if (i == smallersize) { + reshapeNeedFlg = 0; + } + } + if (reshapeNeedFlg) { + reshape(input, output); + } else { DLOG << "No need to reshape"; output->ShareDataWith(*input); framework::LoD lod = input->lod(); @@ -118,9 +138,6 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { output->scale[0] = input->scale[0]; return; } - - reshape(input, output); - // } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp index 194fd5a30565b866ca702b296981d0b8302a1c16..44aae4be321db6797d3450cec7c2f159b5e5124b 100644 --- a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp @@ -48,7 +48,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { template <> void SigmoidKernel::Compute(const SigmoidParam ¶m) { fpga::PerformBypass(param.FpgaArgs()); - param.Out()->scale[0] = 127.0; + param.Out()->scale[0] = 1.0; } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100644 new mode 100755 index a1500ecdb0246d4c7235de490437945ec381d5a4..d32dddb3072b9b7181da4d871fe6cea37db5de04 --- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -30,6 +30,7 @@ bool SliceKernel::Init(SliceParam* param) { } return true; } + template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension @@ -38,6 +39,8 @@ void SliceKernel::Compute(const SliceParam& param) { auto input = param.input_; auto output = param.output_; + int H = input->dims()[2]; + int W = input->dims()[3]; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); @@ -53,10 +56,32 @@ void SliceKernel::Compute(const SliceParam& param) { end = end > channel ? 
channel : end; int len = end - start; size_t size = len * sizeof(int8_t); + DLOG << input->fpga_data_num; + fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); + DLOG << output->fpga_data_num; + fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); + int unalignedWC = len * W; + int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + if (unalignedWC != alignedWC) { + auto tmpOutput = reinterpret_cast + (fpga::fpga_malloc(len*HW * sizeof(int8_t))); + for (int i = 0; i < HW; i++) { + memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); + } + for (int i = 0; i < H; i++) { + for (int j = 0; j < unalignedWC; j++) { + *(output_ptr + alignedWC * i + j) = + *(tmpOutput + unalignedWC * i + j); + } + } + fpga::fpga_free(tmpOutput); + } else { + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + } } + fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv3x3.cpp b/mobile/src/operators/math/depthwise_conv3x3.cpp index 11fce286051dbaa158ae9db917452c4987122f32..4f8b7a7b3000a9130dac1c755a3beb16e7c98c59 100644 --- a/mobile/src/operators/math/depthwise_conv3x3.cpp +++ b/mobile/src/operators/math/depthwise_conv3x3.cpp @@ -150,7 +150,8 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, const int out_image_size = output_h * output_w; const int valid_h_start = padding_h; const int valid_h_end = output_h - valid_h_start; - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = padding_w; const int valid_w_end = output_w - valid_w_start; const int valid_w = valid_w_end - valid_w_start; @@ -631,7 +632,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); @@ -659,7 +660,8 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, const int valid_h_start = (padding_h + 1) / 2; const int valid_h_end = std::max((input_h + padding_h - 1) / 2, valid_h_start); - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = (padding_w + 1) / 2; const int valid_w_end = std::max((input_w + padding_w - 1) / 2, valid_w_start); @@ -1045,7 +1047,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker);
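
The final hunks clamp the depthwise-conv "valid" row bookkeeping so that over-sized paddings can no longer yield a negative `valid_h` or let the pad-bottom loop begin before the valid region. Below is a minimal standalone sketch of that clamping logic under the same stride-1 assumptions; the helper names and the `main()` driver are illustrative only and are not part of the patch.

    // Sketch of the clamped valid-row range used by the DepthwiseConv3x3S1 fix.
    // Names below (ValidRegion, ComputeValidRows) are illustrative, not from the patch.
    #include <algorithm>
    #include <cstdio>

    struct ValidRegion {
      int valid_h_start;
      int valid_h_end;
      int valid_h;  // clamped to >= 0 so downstream loops cannot run backwards
    };

    // Stride-1 case: rows untouched by padding span [padding_h, output_h - padding_h).
    // With large paddings this range is empty, hence the clamp to zero.
    ValidRegion ComputeValidRows(int output_h, int padding_h) {
      ValidRegion r;
      r.valid_h_start = padding_h;
      r.valid_h_end = output_h - padding_h;
      r.valid_h = std::max(r.valid_h_end - r.valid_h_start, 0);
      return r;
    }

    int main() {
      // Normal case: 8 output rows, padding 1 -> rows 1..6 are valid.
      ValidRegion a = ComputeValidRows(8, 1);
      // Degenerate case: padding larger than half the output height -> empty range.
      ValidRegion b = ComputeValidRows(3, 2);
      std::printf("a: start=%d end=%d valid=%d\n", a.valid_h_start, a.valid_h_end, a.valid_h);
      std::printf("b: start=%d end=%d valid=%d\n", b.valid_h_start, b.valid_h_end, b.valid_h);

      // Pad-bottom loop guarded the same way as in the patch: it only visits rows
      // strictly after valid_h_start - 1, so it never runs when valid_h_end has
      // collapsed below valid_h_start.
      for (int h = b.valid_h_end; h < 3 && h > b.valid_h_start - 1; ++h) {
        std::printf("pad-bottom row %d\n", h);
      }
      return 0;
    }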